Пример #1
0
 def test(self,target,modelPath,wordVectors,windowSizeFeatures,useManualFeature,numberEpoch,
          noTestByEpoch,unknownTokens,nmFeatureSentenceToPrint = 2):
     """Evaluate a trained CRFsuite model on the target data set.

     Tags every sequence yielded by self.__instances() with the model at
     modelPath and counts token-level agreement with the gold labels.

     Returns a tuple (numberCorrect, total): the number of tokens whose
     predicted label equals the gold label, and the total token count.

     NOTE(review): numberEpoch, noTestByEpoch and unknownTokens are
     accepted but never used in this body — presumably kept for a shared
     caller signature; confirm before removing.
     """
     # NOTE(review): fixed delay before opening the model — presumably to
     # let the trainer finish writing the model file; confirm.
     sleep(5)

     tagger = crfsuite.Tagger()
     tagger.open(modelPath)

     total = 0
     numberCorrect = 0;

     # "Comecando teste" is Portuguese for "starting test".
     self.__logger.info( "Comecando teste: " + time.ctime())
     for xseq, yseqcor in self.__instances(target, wordVectors, windowSizeFeatures, useManualFeature):
         # Dump the features of the first nmFeatureSentenceToPrint
         # sentences for debugging.
         if nmFeatureSentenceToPrint > 0:
             self.__printFeatures(xseq, yseqcor)
             nmFeatureSentenceToPrint-=1

         tagger.set(xseq)


         # Obtain the label sequence predicted by the tagger.
         yseqpred = tagger.viterbi()

         # Token-level comparison of gold vs. predicted labels.
         for cor, pred in  itertools.izip(yseqcor, yseqpred):
 #             logger.info("cor | pred : " + cor + " | " + pred);
             if cor == pred:
                 numberCorrect += 1
             total += 1


     return (numberCorrect,total)
Пример #2
0
def build_entity_dict_proc(shared_dict, keys):
    """Worker-process entry point: extract entities for every sentence
    stored under each key of *shared_dict* and dump the accumulated
    results periodically and once at the end."""
    proc_name = current_process().name
    print(proc_name + ': initializing')

    # Each worker builds its own tagger / feature extractor.
    tagger = crfsuite.Tagger()
    tagger.open(crf_model_path)
    feat_extractor = FeatureExtractor(use_models=False)
    print(proc_name + ': tagger and feature extractor loaded')

    ents = OrderedDict()

    for idx, key in enumerate(keys):
        sentences = shared_dict[key]
        collected = ents.setdefault(key, [])
        for sentence in sentences:
            collected.extend(extract_entities(tagger, feat_extractor, sentence))
        # De-duplicate while preserving first-seen order.
        ents[key] = list(OrderedDict.fromkeys(collected))
        # Periodic progress report and backup dump.
        if idx % 50000 == 0:
            print(proc_name + ':', idx)
            dump_dict(proc_name + '_', ents, idx)

    # Final dump once every key has been processed.
    dump_dict(proc_name + '_end', ents, len(keys))
    print(proc_name + ':', len(keys))
    print(proc_name + ': terminated')
Пример #3
0
def main(feature_extractor,
         fields='w pos y',
         sep=' ',
         fi=sys.stdin,
         fo=sys.stdout):
    """Read a data set from *fi* and either dump extracted attributes to
    *fo*, or — when ``-t MODEL`` is given — tag each sequence with a
    CRFsuite model and write the tagged rows instead.

    feature_extractor -- callable applied in place to each sequence X
    fields            -- default space-separated field names (``-f``)
    sep               -- default column separator (``-s``)
    """
    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed).""")
    parser.add_option(
        '-t',
        dest='model',
        help='tag the input using the model (requires "crfsuite" module)')
    parser.add_option(
        '-f',
        dest='fields',
        default=fields,
        help='specify field names of input data [default: "%default"]')
    parser.add_option(
        '-s',
        dest='separator',
        default=sep,
        help=
        'specify the separator of columns of input data [default: "%default"]')
    options, _args = parser.parse_args()

    # Names of the input columns, e.g. ('w', 'pos', 'y').
    field_names = options.fields.split(' ')

    if options.model:
        # Tagging mode: load an existing model and label every sequence.
        import crfsuite
        tagger = crfsuite.Tagger()
        tagger.open(options.model)

        for X in readiter(fi, field_names, options.separator):
            feature_extractor(X)
            labels = tagger.tag(to_crfsuite(X))
            for t, token in enumerate(X):
                fo.write('\t'.join([token[f] for f in field_names]))
                fo.write('\t%s\n' % labels[t])
            fo.write('\n')
    else:
        # Feature-dump mode: emit the extracted attributes only.
        for X in readiter(fi, field_names, options.separator):
            feature_extractor(X)
            output_features(fo, X, 'y')
Пример #4
0
def excute(excelfile, resultfile):
    """Run two-stage NER (boundary model, then class model) over every
    sheet of an Excel workbook and write the results to a new workbook.

    excelfile  -- input .xls/.xlsx path; column 0 of each sheet is the text
    resultfile -- output .xls/.xlsx path
    """
    # Two taggers: boundary detection (model_b) and class labeling (model_c).
    tagger_b = crfsuite.Tagger()
    tagger_b.open(model_b)
    tagger_c = crfsuite.Tagger()
    tagger_c.open(model_c)

    print model_b, model_c
    # BIESO tag set produced by the boundary model.
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']

    # Reject non-Excel input (the Chinese message says
    # "please supply a correct file format").
    if not (excelfile.endswith('.xls') or excelfile.endswith('.xlsx')):
        print '请输入正确的文件格式。'
        exit(1)

    data = xlrd.open_workbook(excelfile)
    workbook = xlwt.Workbook()
    for sheeti in range(len(data.sheets())):
        table = data.sheets()[sheeti]
        nrows = table.nrows

        if resultfile.endswith('.xls') or resultfile.endswith('.xlsx'):
            sheet = workbook.add_sheet(table.name)

            # Bold header row: "original data" / "entity recognition result".
            style = xlwt.easyxf('font: bold 1;')
            sheet.write(0, 0, u'原始数据', style)
            sheet.write(0, 1, u'实体识别结果', style)

            print 'generating %s...' % table.name
            # Row 0 is the header; data rows start at 1.
            for i in range(1, nrows):

                line = table.row_values(i)[0]
                # Normalize punctuation/symbols before tagging.
                new_line = tools.uniformSignal(line)
                print 'new line: ', new_line
                sheet.write(i, 0, unicode(line))
                sheet.write(
                    i, 1,
                    unicode(getNerResult(new_line, tagger_b, tagger_c, bieso)))
        else:
            # Output path has the wrong extension; same "wrong format" message.
            print '请输入正确的文件格式。'
            exit(1)
    workbook.save(resultfile)
Пример #5
0
    def fit(self, X, y):
        """Train the CRF on the given sequences and load the resulting model.

        X -- iterable of lists of lists of features (=strings), one per sequence
        y -- list of lists of token labels (=strings), parallel to X
        """
        for features_iter, labels in zip(X, y):
            items = ItemSequence(features_iter, check=True)
            self.trainer.append(items, tuple(labels), 0)

        # We only need a path for CRFsuite to write the model to; close the
        # temp-file handle immediately so it is not leaked (the original
        # left the NamedTemporaryFile object open).
        tmp = tempfile.NamedTemporaryFile(delete=False)
        self.model_filepath = tmp.name
        tmp.close()
        self.trainer.train(self.model_filepath, -1)
        # Persist to file and pull it back out: CRFsuite cannot build a
        # tagger directly from a trainer object.
        self.tagger = crfsuite.Tagger()
        self.tagger.open(self.model_filepath)
Пример #6
0
    def from_file(cls, model_filepath):
        """Build a CRF instance around an already-trained model file.

        CRFSuite does not let us construct a tagger directly from a
        trained trainer object, so loading always goes through a model
        file on disk; *model_filepath* must point to an existing model.
        """
        instance = cls()
        instance.tagger = crfsuite.Tagger()
        logger.debug('Loading existing model from %s', model_filepath)
        instance.tagger.open(model_filepath)
        instance.model_filepath = model_filepath
        return instance
Пример #7
0
def crf_tag(crf_model_file, crf_test_file, feature_list_file, output_file, options_dict):
    """Tag *crf_test_file* with the model in *crf_model_file* and write a
    "Prediction,True Label" CSV to *output_file*.

    feature_list_file -- file naming the features to include
    options_dict      -- passed through to read_file_to_crfsuite
    """
    # Create a tagger object and load the model into it.
    tagger = crfsuite.Tagger()
    tagger.open(crf_model_file)

    feature_inclusion_list = crf_train.get_feature_inclusion_list(feature_list_file)

    # 'with' guarantees the output file is closed even if tagging fails
    # (the original opened it and never closed it).
    with open(output_file, 'w') as output:
        output.write("Prediction,True Label\n")
        read_file_to_crfsuite(crf_test_file, feature_inclusion_list, tagger, output, options_dict)
    """
Пример #8
0
def selectActiveData(unselected, selected, model, num):
    """Active-learning scoring pass: tag each unselected sample with the
    boundary model and compute its total per-token entropy.

    NOTE(review): `select`, `selected` and `num` are never used, and the
    `entropy` list that is built is never returned — the function falls
    off the end returning None. This snippet is probably truncated;
    confirm the intended return/selection step before relying on it.
    """
    select = []
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model)
    # BIESO tag set of the boundary model.
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    entropy = []
    for line in unselected:
        # Parse the raw sample into sentence + annotated entities, then
        # normalize symbols.
        sentence, entities_in_sentence = processing.generateSenEntities(
            line, '')
        new_sentence, new_entities = processing.symbolProcess(
            sentence, entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = processing.generateTagSeq(sentence_unicode, new_entities)
        # Generate boundary features (one tab-separated instance per char).
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            print 'feature_string:%s.' % line
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            # Columns 0-1 are char/tag; the rest are feature attributes.
            features.append(fields[2:])
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)

        yseq = []
        ie_entity = 0.0
        for i in range(length):
            yseq.append(yseq_b[i])

        # Sum the Shannon entropy of the marginal tag distribution over
        # all token positions.
        for j in range(len(yseq)):
            ie = 0.0  # information entropy at position j
            for ent_tag in bieso:
                try:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    ie += tag_prob * math.log(1 / tag_prob, 2)
                except Exception, e:
                    print line
                    exit(0)
            ie_entity += ie
        entropy.append((line, ie_entity))
Пример #9
0
def mainfunction(sen, postProcess, texttype, index):
    """Two-layer NER for a single sentence: a boundary model finds entity
    spans, then a class model assigns each span a type.

    sen         -- raw input sentence (utf-8 byte string)
    postProcess -- '1' to apply dictionary/rule post-processing
    texttype    -- text-domain flag forwarded to feature generation
    index       -- suffix selecting which model pair to load

    Returns (entity_list, new_yseq): a printable "content[type]" listing
    and the character-level BIESO tag sequence.

    NOTE(review): ner_lines, model_chosen, bieso and tags are assigned
    but never used in this body.
    """
    model_b = os.path.join(root, './models/boundarymodel-' + index)
    model_c = os.path.join(root, './models/classmodel-' + index)

    ner_lines = ''

    tagger_b = crfsuite.Tagger()
    tagger_b.open(model_b)
    tagger_c = crfsuite.Tagger()
    tagger_c.open(model_c)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']

    line = sen.strip()

    # Layer 1: boundary model — locate entity spans (untyped).
    feature_string = ''
    instances = []
    feature_string, tags = generateFeature.boundaryFeatureGeneration(
        line, [], ebao_dic, 'demo', '0')
    try:
        instances = feature_string.strip().split('\n')
    except AttributeError as e:
        print 'feature_string:%s.' % feature_string
    xseq = crfsuite.ItemSequence()
    for instance in instances:
        fields = instance.split('\t')
        item = crfsuite.Item()
        # Columns 0-1 are char/tag; the rest are feature attributes.
        for field in fields[2:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    tagger_b.set(xseq)
    yseq_b = tagger_b.viterbi()

    line_unicode = line.decode('utf-8')

    model_chosen = '2layer'
    # Layer 2: class model — type each span found by the boundary model.
    sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])

    length = len(sen_ent_list1[0])
    # (length may be 0 when no entities were found)
    entities = []
    new_entities = []
    for j in range(length):
        ent_start = sen_ent_list1[0][j][0]
        ent_end = sen_ent_list1[0][j][1]
        ent_type = sen_ent_list1[0][j][2]
        ent_content = line_unicode[ent_start:ent_end].encode('utf-8')
        entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
    feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
        line, entities, ebao_dic, texttype)
    # One blank-line-separated instance per entity.
    instances = feature_c.strip().split('\n\n')
    ents_type = []
    for instance in instances:
        xseq = crfsuite.ItemSequence()
        fields = instance.split('\t')
        item = crfsuite.Item()
        for field in fields[1:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
        tagger_c.set(xseq)
        yseq_c = tagger_c.viterbi()
        ents_type.append(yseq_c[0])
    # Rebuild a character-level BIESO tag sequence from the typed spans.

    new_yseq = ['O' for i in range(len(line_unicode))]
    for j in range(len(entities)):
        start = entities[j].start_pos
        end = entities[j].end_pos
        enttype = ents_type[j]
        if start + 1 == end:
            new_yseq[start] = 'S-' + enttype
            continue
        new_yseq[start] = 'B-' + enttype
        for k in range(start + 1, end - 1):
            new_yseq[k] = 'I-' + enttype
        new_yseq[end - 1] = 'E-' + enttype

    if postProcess == '1':  # NOTE: the start_end_list used in evaluation is not adjusted
        new_yseq = postProcessing.twoProcessings(line_unicode, new_yseq,
                                                 ebao_dic, texttype)

    ents1, s_e_list1 = evaluation.generateEntList([new_yseq])
    new_entities = ents1[0]

    # Format the final entities as "content[Chinese type name]" lines.
    entity_list = ''
    length = len(new_entities)
    for i in range(length):
        content = line_unicode[new_entities[i][0]:new_entities[i][1]]
        enttype = new_entities[i][2]
        if enttype == '':
            print line_unicode.encode('utf8'), line_unicode[
                new_entities[i][0]:new_entities[i][1]].encode('utf8')
        entity_list += content.encode(
            'utf8') + '[' + en_cn_dic[enttype] + ']\n'
    return entity_list, new_yseq
Пример #10
0
    'medicine_pn': '药品-产品名',
    'medicine_mn': '药品-商品名',
    'dosage_form': '剂型',
    'specifications': '规格',
    'packing_spe': '包装规格',
    'packing_material': '包材',
    'enterprise': '企业机构',
    'department': '科室',
    'address': '地址',
    'healthy_food': '保健食品',
    'social_insurance': '社保',
}

cn_en_dic = dict((y, x) for x, y in en_cn_dic.iteritems())

tagger_b_defalt = crfsuite.Tagger()
tagger_b_defalt.open(os.path.join(root, './models/boundarymodel-6'))
tagger_c_defalt = crfsuite.Tagger()
tagger_c_defalt.open(os.path.join(root, './models/classmodel-6'))

tagger_b_jsd = crfsuite.Tagger()
tagger_b_jsd.open(os.path.join(root, './models/boundarymodel-4'))
tagger_c_jsd = crfsuite.Tagger()
tagger_c_jsd.open(os.path.join(root, './models/classmodel-4'))

tagger_b_un = crfsuite.Tagger()
tagger_b_un.open(os.path.join(root, './models/boundarymodel-5'))
tagger_c_un = crfsuite.Tagger()
tagger_c_un.open(os.path.join(root, './models/classmodel-5'))

tagger_b_si = crfsuite.Tagger()
Пример #11
0
            if p == -1:
            	# Unweighted (weight=1) attribute.
                item.append(crfsuite.Attribute(field))
            else:
            	# Weighted attribute
                item.append(crfsuite.Attribute(field[:p], float(field[p+1:])))

        # Append the item to the item sequence.
        xseq.append(item)

# Script entry point: load a CRFsuite model (path in argv[1]), tag each
# instance read from stdin, and print the sequence probability followed by
# each predicted label with its marginal probability.
if __name__ == '__main__':
    fi = sys.stdin
    fo = sys.stdout

    # Create a tagger object.
    tagger = crfsuite.Tagger()
    
    # Load the model to the tagger.
    tagger.open(sys.argv[1])

    for xseq in instances(fi):
        # Tag the sequence.
        tagger.set(xseq)
        # Obtain the label sequence predicted by the tagger.
        yseq = tagger.viterbi()
        # Output the probability of the predicted label sequence.
        print tagger.probability(yseq)
        for t, y in enumerate(yseq):
            # Output the predicted labels with their marginal probabilities.
            print '%s:%f' % (y, tagger.marginal(y, t))
        print
Пример #12
0
def main(feature_extractor, fields='w pos y', sep=' '):
    fi = sys.stdin
    fo = sys.stdout

    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed).""")
    parser.add_option(
        '-t',
        dest='model',
        help='tag the input using the model (requires "crfsuite" module)')
    parser.add_option(
        '-f',
        dest='fields',
        default=fields,
        help='specify field names of input data [default: "%default"]')
    parser.add_option(
        '-s',
        dest='separator',
        default=sep,
        help=
        'specify the separator of columns of input data [default: "%default"]')
    (options, args) = parser.parse_args()
    #print options
    #print args

    # The fields of input: ('w', 'pos', 'y) by default.
    F = options.fields.split(' ')
    #print F

    if not options.model:
        # The generator function readiter() reads a sequence from a
        X = []
        for idx, line in enumerate(fi.readlines()):
            line = line.strip('\n').decode("utf8")
            if line == '':
                feature_extractor(X)
                output_features(fo, X, 'y')
                X = []
            else:
                fields = line.split(sep)
                if len(fields) < len(F):
                    raise ValueError('Too few fields (%d) for %r\n%s' %
                                     (len(fields), F, line))
                item = {'F': []}  # 'F' is reserved for features.
                for i in range(len(F)):
                    item[F[i]] = fields[i]
                X.append(item)

        # for X in readiter(fi, F, options.separator):
        #     # print X
        #     feature_extractor(X)
        #     output_features(fo, X, 'y')

    else:
        # Create a tagger with an existing model.
        import crfsuite
        tagger = crfsuite.Tagger()
        tagger.open(options.model)

        # For each sequence from STDIN.
        idx = 1
        for X in readiter(fi, F, options.separator):
            # Obtain features.
            print idx
            feature_extractor(X)
            xseq = to_crfsuite(X)
            yseq = tagger.tag(xseq)
            for t in range(len(X)):
                v = X[t]
                fo.write('\t'.join([v[f] for f in F]))
                fo.write('\t%s\n' % yseq[t])
            fo.write('\n')

            idx = idx + 1
        fo.close()
Пример #13
0
def predictClassAfterBoundaryAndEval(boundary_result, sentence_list,
                                     sen_tags_list, classmodel_file, ebao_dic,
                                     post_processing, texttype):
    """Second NER stage plus evaluation: given boundary-model output, run
    the class model over each detected span, optionally post-process the
    resulting tag sequences, and print precision/recall/F1 against the
    gold annotations.

    boundary_result -- raw output of the boundary model
    sentence_list   -- utf-8 byte-string sentences, parallel to the results
    sen_tags_list   -- gold tag sequences (multi-class)
    classmodel_file -- path to the trained class model
    post_processing -- '1' to apply dictionary/rule post-processing
    """
    tagger = crfsuite.Tagger()
    tagger.open(classmodel_file)
    result_tags_list = evaluation.generateTagList(boundary_result)
    # List 1 is the system output, list 2 the gold-standard data.
    sen_ent_list1, start_end_list1 = evaluation.generateEntList(
        result_tags_list)  # only the single 'entity' class
    sen_ent_list2, start_end_list2 = evaluation.generateEntList(
        sen_tags_list)  # multiple entity classes
    length = len(sen_ent_list1)
    new_sen_ent_list1 = []
    sen_inside_ent_list = []
    for i in range(length):
        # Build the Entity objects for this sentence's detected spans.
        sentence = sentence_list[i]
        sentence_unicode = sentence.decode('utf-8')
        entities = []
        new_entities = []
        s_e_list = []
        if len(sen_ent_list1[i]) == 0:
            # No spans detected: record placeholders and move on.
            sen_inside_ent_list.append([['']])
            new_sen_ent_list1.append(sen_ent_list1[i])
            continue
        for j in range(len(sen_ent_list1[i])):
            ent_start = sen_ent_list1[i][j][0]
            ent_end = sen_ent_list1[i][j][1]
            ent_type = sen_ent_list1[i][j][2]
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
            s_e_list.append([sentence, ent_content])
        sen_inside_ent_list.append(s_e_list)
        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            sentence, entities, ebao_dic, texttype)

        # One blank-line-separated instance per entity; classify each.
        instances = feature_c.strip().split('\n\n')
        ents_type = []
        for instance in instances:
            xseq = crfsuite.ItemSequence()
            fields = instance.split('\t')
            item = crfsuite.Item()
            for field in fields[1:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
            tagger.set(xseq)
            yseq = tagger.viterbi()
            ents_type.append(yseq[0])
        # Post-processing branch.
        if post_processing == '1':  # NOTE: the start_end_list used in evaluation is not adjusted
            # Rebuild a character-level BIESO sequence from the typed spans,
            # post-process it, and regenerate the entity list from it.
            new_yseq = ['O' for i in range(len(sentence_unicode))]
            for j in range(len(entities)):
                start = entities[j].start_pos
                end = entities[j].end_pos
                enttype = ents_type[j]
                if start + 1 == end:
                    new_yseq[start] = 'S-' + enttype
                    continue
                new_yseq[start] = 'B-' + enttype
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + enttype
                new_yseq[end - 1] = 'E-' + enttype
            new_yseq1 = postProcessing.twoProcessings(sentence_unicode,
                                                      new_yseq, ebao_dic,
                                                      texttype)
            tag_list1 = []
            tag_list1.append(new_yseq1)
            ents1, s_e_list1 = evaluation.generateEntList(tag_list1)
            new_entities = ents1[0]
        else:
            # No post-processing: pair each boundary span with its class.
            for k in range(len(ents_type)):
                try:
                    new_entities.append((sen_ent_list1[i][k][0],
                                         sen_ent_list1[i][k][1], ents_type[k]))
                except Exception as e:
                    print e
                    print len(sen_ent_list1[i]), len(ents_type)
                    print sentence
                    print feature_c
        new_sen_ent_list1.append(new_entities)

    # Error analysis: count matches and print precision/recall/F1.
    ent_count_result, ent_count_result_o = evaluation.countEntList(
        new_sen_ent_list1, sen_ent_list2, start_end_list1, start_end_list2,
        sen_inside_ent_list)
    evaluation.measurePRF(ent_count_result_o)
Пример #14
0
def semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic):
    """Semi-supervised data selection: tag each candidate sample with the
    previous-round boundary model, keep entities whose total marginal
    entropy exceeds *ie_value* (aligned against the sample's own
    annotation), and re-emit the selected samples as new training
    feature strings.

    Returns (cdd4training_semi, cdd4training_semi_number): the list of
    regenerated feature strings and its length.
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model_previous)
    # BIESO tag set of the boundary model.
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    cdd4training_semi = []
    cdd4training_semi_number = 0
    for line in fsamples:
        # Filter the sample using the recognized entities.
        sentence, entities_in_sentence = generateSenEntities(line)
        new_sentence, new_entities = symbolProcess(sentence,
                                                   entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            print 'feature_string:%s.' % line
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            # Columns 0-1 are char/tag; the rest are feature attributes.
            features.append(fields[2:])
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)

        yseq = []
        for i in range(length):
            yseq.append(yseq_b[i])
        # Tag refinement: compare model output against the sample's own tags.
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        sen_ent_list2, start_end_list2 = evaluation.generateEntList([tag_seq])
        tagged_ents_length = len(start_end_list1[0])
        if tagged_ents_length == 0: continue

        ents = []
        selected_entity = 0
        ent_index = 0
        for i in range(tagged_ents_length):
            ent_start = start_end_list1[0][i][0]
            # Skip spans already covered by a previously accepted entity.
            if ent_start < ent_index: continue
            flag = 0
            ent_end = start_end_list1[0][i][1]
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            ie_entity = 0.0
            # Sum the Shannon entropy of the marginal tag distribution
            # over the span's character positions.
            for j in range(ent_start, ent_end):
                ie = 0.0  # information entropy at position j
                for ent_tag in bieso:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    ie += tag_prob * math.log(1 / tag_prob, 2)
                ie_entity += ie
            # ie_ave = ie_entity / (ent_end - ent_start)
            # if ebao_dic.has_key(ent_content) and ie_ave > ie_value:
            if ie_entity > ie_value:
                # High-entropy span: accept it if it lies inside one of the
                # sample's annotated entities.
                for k in range(len(start_end_list2[0])):
                    start_m = start_end_list2[0][k][0]
                    end_m = start_end_list2[0][k][1]
                    if ent_start >= start_m and ent_end <= end_m:
                        # if end_m - start_m < 3: break
                        ents.append(
                            Entity(
                                sentence_unicode[start_m:end_m].encode(
                                    'utf-8'), int(start_m), int(end_m),
                                'entity'))
                        ent_index = end_m
                        flag = 1
                        break
                if flag == 0:
                    continue

                    # NOTE(review): everything from here to `ent_index = end_m`
                    # is unreachable — it follows an unconditional `continue`.
                    # It looks like a disabled dictionary-fallback branch;
                    # confirm intent before deleting or re-enabling.
                    if not ebao_dic.has_key(ent_content): continue
                    ents.append(
                        Entity(ent_content, int(ent_start), int(ent_end),
                               'entity'))
                    ent_index = end_m
                selected_entity += 1

        # Drop samples from which nothing was selected.
        if selected_entity == 0: continue

        # Regenerate per-character tags for the accepted entities and
        # rebuild the training feature string.
        char_entity_tag_list = generateFeature.getCharEntityFPTag(
            sentence_unicode, ents, '1')
        char_entity_tag_list = generateFeature.getCharEntityPartialTag(
            char_entity_tag_list)

        new_feature_str = ''
        for j in range(length):
            new_feature_str += '%s\t%s\n' % (char_entity_tag_list[j][1][0],
                                             '\t'.join(features[j]))

        cdd4training_semi.append(new_feature_str.strip())
        cdd4training_semi_number += 1
    return cdd4training_semi, cdd4training_semi_number