Пример #1
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            training_opt):
    """Train a single CRF tagger for one fold and collect binary tags.

    Both essay sets are converted with
    ``to_label_powerset_tagged_sentences``; the tagger is trained on the
    training sentences and then used to tag both sets.

    Args:
        essays_TD: Training essays.
        essays_VD: Validation essays.
        regular_tags: Tags passed to the conversion helpers.
        fold: Fold number, used in the progress message and the model file
            name.
        training_opt: Forwarded to ``CRFTagger`` as ``training_opt``.

    Returns:
        A 4-tuple of ``to_flattened_binary_tags_by_code`` results:
        (actual training tags, actual validation tags,
        predicted training tags, predicted validation tags).
    """
    print("Fold %i Training code" % fold)

    # Tagged sentences used both for training and as the gold standard.
    train_tagged = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_tagged = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # The model file gets a random suffix and is deleted before returning.
    model_basename = "%i_%s__%s" % (fold, "power_set",
                                    str(randint(0, 9999999)))
    model_path = models_folder + "/" + model_basename

    tagger = CRFTagger(feature_func=comp_feat_extactor,
                       verbose=False,
                       training_opt=training_opt)
    tagger.train(train_tagged, model_path)

    train_predicted = tagger.tag_sents(to_sentences(train_tagged))
    valid_predicted = tagger.tag_sents(to_sentences(valid_tagged))

    # Actual labels, flattened per code.
    actual_train = to_flattened_binary_tags_by_code(train_tagged, regular_tags)
    actual_valid = to_flattened_binary_tags_by_code(valid_tagged, regular_tags)

    # Predicted labels, flattened per code.
    predicted_train = to_flattened_binary_tags_by_code(train_predicted,
                                                       regular_tags)
    predicted_valid = to_flattened_binary_tags_by_code(valid_predicted,
                                                       regular_tags)

    os.remove(model_path)

    return actual_train, actual_valid, predicted_train, predicted_valid
Пример #2
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code for a fold and collect binary tags.

    The essays are grouped with ``to_tagged_sentences_by_code``.  For every
    code in ``sorted(regular_tags)`` a separate ``CRFTagger`` is trained on
    that code's training sentences and used to tag both sets.

    Args:
        essays_TD: Training essays.
        essays_VD: Validation essays.
        regular_tags: Codes to train a tagger for.
        fold: Fold number, used in progress messages and model file names.

    Returns:
        A 4-tuple of dicts keyed by code: (actual training tags, actual
        validation tags, predicted training tags, predicted validation
        tags), each value produced by ``to_flattened_binary_tags``.
    """
    train_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    valid_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    actual_train, actual_valid = {}, {}
    predicted_train, predicted_valid = {}, {}

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        train_sents = train_by_code[code]
        valid_sents = valid_by_code[code]

        # Randomised file name; the file is removed again further down.
        model_basename = "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))
        model_path = models_folder + "/" + model_basename

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        tagger = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        tagger.train(train_sents, model_path)

        actual_train[code] = to_flattened_binary_tags(train_sents)
        actual_valid[code] = to_flattened_binary_tags(valid_sents)

        train_tagged = tagger.tag_sents(to_sentences(train_sents))
        valid_tagged = tagger.tag_sents(to_sentences(valid_sents))
        # Predictions are in memory now, so the model file can go.
        os.remove(model_path)

        predicted_train[code] = to_flattened_binary_tags(train_tagged)
        predicted_valid[code] = to_flattened_binary_tags(valid_tagged)

    return actual_train, actual_valid, predicted_train, predicted_valid
Пример #3
0
 def demo(self, test_sents):
     """Tag every sentence of ``test_sents``, print the result and the score.

     A ``CRFTagger`` is created with ``self.feature_detector`` and loaded
     from ``self.modelpath``.  Each sentence is untagged, re-tagged and
     passed through ``self._to_sentence``; every item it yields is printed.
     Finally the value of ``tagger.evaluate(test_sents)`` is printed.

     Args:
         test_sents: Tagged sentences; iterated once for tagging and then
             passed to ``evaluate``.
     """
     crf = CRFTagger(feature_func=self.feature_detector)
     crf.set_model_file(self.modelpath)
     for gold_sent in test_sents:
         words = untag(gold_sent)
         predicted = crf.tag(words)
         for line in self._to_sentence(predicted):
             print(line)
     score = crf.evaluate(test_sents)
     print(score)
Пример #4
0
 def train(self, load_model=None):
     """Train a ``CRFTagger`` on ``self.train_set`` and store it in ``self.model``.

     The training data is prepared with ``CRF._fin_data_prep`` and the
     feature function comes from ``self._gen_ftr_func()``.  The model is
     written to the file ``'stc_crf_model'``.

     Args:
         load_model: Not used by this method.

     Returns:
         ``self``, so calls can be chained.
     """
     prepared = CRF._fin_data_prep(self.train_set)
     feature_func = self._gen_ftr_func()
     options = {"num_memories": 500, "delta": 1e-8}
     self.model = CRFTagger(feature_func,
                            verbose=False,
                            training_opt=options)
     self.model.train(prepared, 'stc_crf_model')
     return self
Пример #5
0
    def pyt_sent_tokenizer(self, paragraph):
        """Split one paragraph into sentences (variant used by pytest).

        The paragraph is stripped, split into words on runs of whitespace,
        tagged with a ``CRFTagger`` loaded from ``self.modelpath`` and the
        tagged words are handed to ``self._to_sentence``.

        Args:
            paragraph (str): The paragraph text.

        Returns:
            Whatever ``self._to_sentence`` returns for the tagged words
            (presumably the paragraph cut into sentences; confirm against
            ``_to_sentence``).
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        words = re.split(r'\s+', paragraph.strip())
        tagged = tagger.tag(words)
        return self._to_sentence(tagged)
Пример #6
0
    def batch_sent_tokenizer(self, paragraphs):
        """Split several paragraphs into sentences.

        One ``CRFTagger`` is loaded from ``self.modelpath`` and reused for
        all paragraphs.  Each paragraph is stripped, split into words on
        runs of whitespace, tagged, and converted with ``self._to_sentence``.

        Args:
            paragraphs (list(str)): The paragraph texts.

        Returns:
            list: One ``self._to_sentence`` result per paragraph, in input
            order.
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        # Split on whitespace *runs* (as pyt_sent_tokenizer does); splitting
        # on single whitespace characters fed empty tokens to the tagger
        # whenever two whitespace characters were adjacent.
        whitespace = re.compile(r'\s+')
        sentences = []
        for paragraph in paragraphs:
            words = whitespace.split(paragraph.strip())
            tagged = tagger.tag(words)
            sentences.append(self._to_sentence(tagged))
        return sentences
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on most-common-code sentences for one fold.

    Code frequencies are tallied on the training essays only and used to
    convert both essay sets with ``to_most_common_code_tagged_sentences``.
    The gold labels for evaluation come from
    ``to_label_powerset_tagged_sentences``.

    Args:
        essays_TD: Training essays.
        essays_VD: Validation essays.
        regular_tags: Tags passed to the conversion helpers.
        fold: Fold number, used in the progress message and the model file
            name.

    Returns:
        A 4-tuple of ``to_flattened_binary_tags_by_code`` results:
        (actual training tags, actual validation tags,
        predicted training tags, predicted validation tags).
    """
    print("Fold %i Training code" % fold)

    # Frequencies must come from the training data only (no leakage from
    # the validation essays).
    frequencies = tally_code_frequencies(essays_TD)

    # Sentences the tagger is trained on / applied to.
    train_tagged = to_most_common_code_tagged_sentences(
        essays_TD, regular_tags, frequencies)
    valid_tagged = to_most_common_code_tagged_sentences(
        essays_VD, regular_tags, frequencies)

    # Random suffix in the file name; the file is deleted before returning.
    model_basename = "%i_%s__%s" % (fold, "most_freq_code",
                                    str(randint(0, 9999999)))
    model_path = models_folder + "/" + model_basename

    tagger = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    tagger.train(train_tagged, model_path)

    train_predicted = tagger.tag_sents(to_sentences(train_tagged))
    valid_predicted = tagger.tag_sents(to_sentences(valid_tagged))

    # Gold labels: built from the label-powerset tagging of the essays.
    train_gold = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_gold = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    actual_train = to_flattened_binary_tags_by_code(train_gold, regular_tags)
    actual_valid = to_flattened_binary_tags_by_code(valid_gold, regular_tags)

    # Predicted labels, flattened per code.
    predicted_train = to_flattened_binary_tags_by_code(train_predicted,
                                                       regular_tags)
    predicted_valid = to_flattened_binary_tags_by_code(valid_predicted,
                                                       regular_tags)
    os.remove(model_path)

    return actual_train, actual_valid, predicted_train, predicted_valid
Пример #8
0
def main():
    """Command-line entry point of the Daba disambiguator.

    Two modes are selected from the parsed options:

    * Learning (``--learn F`` together with ``--pos`` and/or ``--tone``):
      reads the files listed in ``--filelist`` (joined onto ``--root``),
      builds sentences of (token, tag) pairs, splits them with
      ``sampling``, trains pycrfsuite model(s), then tags the evaluation
      part and prints the accuracy.  In tone mode one model per phase is
      trained and the models are stored in ``F.zip``.
    * Disambiguation (``--disambiguate F``): for POS, loads model ``F``,
      reorders the gloss options of every token of ``--infile`` by
      decreasing marginal probability of their first POS tag, and writes
      the result to ``--outfile``.  Tone disambiguation is a no-op.

    Any other combination (including ``--learn`` without ``--pos`` or
    ``--tone``, unless ``--disambiguate`` is given) prints the help text.
    The function always ends the process through ``sys.exit``.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v',
                         '--verbose',
                         help='Verbose output',
                         default=False,
                         action='store_true')
    aparser.add_argument(
        '-l',
        '--learn',
        help='Learn model from data (and save as F if provided)',
        default=None)
    aparser.add_argument('-p',
                         '--pos',
                         help='Prediction for POS',
                         default=False,
                         action='store_true')
    aparser.add_argument('-t',
                         '--tone',
                         help='Prediction for tones',
                         default=False,
                         action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f',
                         '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument(
        '-e',
        '--evalsize',
        type=int,
        default=10,
        help=
        'Percent of training data with respect to training and test one (default 10)'
    )
    aparser.add_argument(
        '-d',
        '--disambiguate',
        help=
        'Use model F to disambiguate data, the gloss list will be ordered by the probability growth order',
        default=None)
    aparser.add_argument(
        '--select',
        help=
        'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.',
        action='store_true')
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument(
        '-s',
        '--store',
        help=
        'Store tagged raw data in file (.csv) for further research purpose',
        default=None)

    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # The '--gloss' option is commented out above, so the namespace has no
    # 'gloss' attribute; reading args.gloss directly raised AttributeError.
    use_gloss = getattr(args, 'gloss', False)

    if args.learn and (args.pos or args.tone or use_gloss):

        print('Make list of files')
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            allfiles = [line.strip() for line in filelist]
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        enc = None
        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                # Best effort: report the failure and continue with no encoder.
                print(("Error : unable to initialize the tone encoder !"))

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if not infile:
                # Skip blank lines of the file list.
                continue
            print('-', infile)
            sent = []

            html_parser = FileParser()
            html_parser.read_file(os.path.join(args.root, infile))

            for sentence in html_parser.glosses:
                for token in sentence[2]:
                    if token.type not in ('w', 'c'):
                        continue
                    if args.pos:
                        tags = '/'.join(token.gloss.ps)
                        wordform = detone(token.gloss.form)
                        sent.append((wordform, tags))
                    elif args.tone:
                        # Why not learn tonal forms containing a vertical
                        # bar?  Across all disambiguated corpora they occur
                        # fewer than 10 times, which seems too rare to
                        # improve the tonalization model.  Nothing in the
                        # design prevents including them in the training
                        # data to observe their contribution.
                        if '|' not in token.gloss.form:
                            codes, chunks = enc.differential_encode(
                                token.token, token.gloss.form)
                            sent.extend(zip(chunks, codes))
                    # Gloss prediction is disabled:
                    # elif args.gloss:
                    #     tags += token.gloss.gloss
                    #     sent.append((token.token, tags))

                # NOTE(review): 'sent' is only reset when it is stored, so a
                # one-token sentence is carried over into the next one;
                # confirm this is intended.
                if len(sent) > 1:
                    allsents.append(sent)
                    sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets.
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set),
              ' sentences) / test (', len(eval_set), ' sentences)')

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training: one model per phase.
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                columns = unzip(sent)
                tokens = columns[0]
                labels = columns[1]
                if num_phases > 1:
                    # Keep only the part of each code handled in this phase.
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                # The per-phase model lives in the zip archive only.
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [
                    _get_features_customised_for_tones(sent, j)
                    for j in range(len(sent))
                ]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    # Concatenate this phase's labels onto the labels
                    # accumulated by the previous phases.
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [
                        label_acc + label
                        for label_acc, label in zip(labels_acc, labels)
                    ]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        if num_phases > 1:
            # Close only after the last phase: closing inside the loop made
            # the extract() of the second phase fail on a closed archive.
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1])])
                for pair in predicted_tokens
            ]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print(("Tagged result is exported in {}".format(args.store)))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the .html text.
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # Report the model path (the message used to show the input
                # file name instead).
                print("Error : unable to open the model {} !".format(
                    args.disambiguate))
                sys.exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(
                    args.infile))
                sys.exit(1)

            # Reorder the gloss options sentence by sentence.
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            # Keep only the option(s) reaching the best
                            # probability.
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max
                            ])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(
                args.outfile))

    else:
        aparser.print_help()
    sys.exit(0)
Пример #9
0
def main():
	
	aparser = argparse.ArgumentParser(description='Daba disambiguator')
	# aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
	# aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
	aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
	aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
	aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
	aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
	aparser.add_argument('-e', '--evalsize', help='Percent of randomized data to use for evaluation (default 10)', default=10)
	aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
	args = aparser.parse_args()

	if args.learn:

		if not args.pos or args.tone or args.gloss:
			print 'Choose pos, tone, gloss or combination of them'
			exit(0)

		print 'Make list of files'
		files1 = glob.iglob("../corbama/*/*.dis.html")
		files2 = glob.iglob("../corbama/*.dis.html")
		allfiles = ""
		for file1, file2 in zip(files1, files2):
			allfiles += file1+','+file2+','
		allsents = []

		print 'Open files and find features / supervision tags'
		for infile in allfiles.split(','):
			if(len(infile)) :
				print '-', infile
				sent = []
				in_handler = formats.HtmlReader(infile, compatibility_mode=False)
				for token in in_handler:
					tag = ''
					if token.type == 'w' or token.type == 'c':
						tags = ''
						if args.pos:
							for ps in token.gloss.ps:
								tags += ps
						if args.tone:
							tags += token.gloss.form.encode('utf-8')
						if args.gloss:
							tags += token.gloss.gloss.encode('utf-8')
						sent.append((token.token, tags))
					if token.type == 'c' and token.token in ['.', '?', '!']:
						if len(sent) > 1:
							allsents.append(sent)
						sent = []

		datalength = len(allsents)
		p = (1-args.evalsize/100.0)
		print 'Randomize and split the data in train (', int(p*datalength),' sentences) / test (', int(datalength-p*datalength),' sentences)'
		random.seed(123456)
		random.shuffle(allsents)
		train_set = allsents[:int(p*datalength)]
		test_set = allsents[int(p*datalength):datalength]

		print 'Building classifier (CRF/NLTK)'
		tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
		t1 = time.time()
		tagger.train(train_set, args.learn)
		t2 = time.time()
		texec = t2-t1
		print "... done in",  time.strftime('%H %M %S', time.localtime(texec))

		print 'Evaluating classifier'
		print tagger.evaluate(test_set)

		if args.verbose:
			print 'Compute detailed output'

	else:
		print 'USE...'
		parser.print_help()

	exit(0)
Пример #10
0
def main():
    """Command-line entry point of the Tonalizer tool (Python 2 code).

    Exactly one of three modes runs, checked in this order:

    * ``--undiacritize``: ``fileReader.read2`` is called with the input and
      output files.
    * ``--learn F``: sentences are read from ``--infile``, encoded with
      ``encoder_tones().differential_encode``, split with ``sampling``, a
      pycrfsuite model is trained and saved as ``F``, then the evaluation
      part is tagged and the accuracy printed (and exported with
      ``csv_export`` when ``--store`` is given).
    * ``--diacritize F``: model ``F`` is loaded, the text of ``--infile`` is
      chunked and tagged, and the decoded forms are written to
      ``--outfile``.

    If none of the three options is given, an error and the help text are
    printed and the process exits.
    """

    aparser = argparse.ArgumentParser(
        description=u'Tonalizer - CRF-based Tone Reconstitution Tool')
    aparser.add_argument('-v',
                         '--verbose',
                         help='Verbose output',
                         default=False,
                         action='store_true')
    aparser.add_argument(
        '-l',
        '--learn',
        help='Learn model from diacritized text (and save as file if provided)',
        default=None,
        type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-e',
        '--evalsize',
        help=
        'Percent of training data with respect to training and test one (default 10)',
        default=10,
        type=float)
    # Disabled option; note that 'chunkmode' is still referenced below.
    #aparser.add_argument('-c', '--chunkmode', help='Word segmentation width (default 3)', default=3, type=int)
    aparser.add_argument('-d',
                         '--diacritize',
                         help='Use model file to diacritize a raw text',
                         default=None)
    aparser.add_argument('-u',
                         '--undiacritize',
                         help='Undiacritize a raw text',
                         default=False,
                         action='store_true')
    aparser.add_argument('-f',
                         '--filtering',
                         help='Keep only one insertion for one poistion',
                         default=False,
                         action='store_true')
    aparser.add_argument('-m',
                         '--markers',
                         help='Custumed set of markers to learn',
                         default=None,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.txt)',
                         default=sys.stdin,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file (.txt)',
                         default=sys.stdout,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-s',
        '--store',
        help=
        'Store evaluation result in file (.csv), effective only in learning mode',
        default=None,
        type=lambda s: unicode(s, 'utf8'))
    args = aparser.parse_args()

    # One of the three modes must be selected.
    if not (args.learn or args.diacritize or args.undiacritize):
        print 'Error : choose -learn, -diacritize or -undiacritize !'
        aparser.print_help()
        exit(0)

    # Verbose: dump every parsed option, with its type name when truthy.
    if args.verbose:
        print 'Arguments received by script'
        dico = vars(args)
        for key, val in dico.items():
            typeName = type(val).__name__
            sys.stdout.write(u"\t{} = {} ".format(key, val))
            if val:
                sys.stdout.write(u"({})".format(typeName))
            print ""

    if args.undiacritize:
        # Mode 1: the whole job is delegated to fileReader.read2.
        fr = fileReader.fileReader(args.markers)
        fr.read2(args.infile, args.outfile)

    elif args.learn:
        # Mode 2: learn a model, then evaluate it.
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                # Pair token[0] with the UTF-8 encoded token[1].
                sent.append((token[0], token[1].encode('utf-8')))
            # Sentences of a single token are dropped.
            if len(sent) > 1:
                allsents.append(sent)

        print 'Word segmentation and diacritic informaiotn compression'
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token_tags in sent:
                token, tags = token_tags
                # NOTE(review): 'chunkmode' is not defined in this function
                # and its command-line option is commented out above;
                # presumably a module-level global - confirm.
                [codes,
                 syllabes] = enc.differential_encode(token,
                                                     tags.decode('utf-8'),
                                                     chunkmode)
                # One (syllable, UTF-8 encoded code) pair per chunk.
                token2 = [(syllabe, code.encode('utf-8'))
                          for syllabe, code in zip(syllabes, codes)]
                sent2.append(token2)
            allsents.append(sent2)

        if args.verbose:
            enc.report()

        # p is the fraction handed to sampling() (1 - evalsize percent).
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(
            train_set), ' sentences) / test (', len(eval_set), ' sentences)'

        print 'Building classifier (pyCRFsuite)'
        # Initialization
        t1 = time.time()

        # A.1. Initialize a new CRF trainer.  The CRFTagger instance is only
        # used here as a source of the verbose flag and training options.
        tagger = CRFTagger(verbose=args.verbose,
                           training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)

        # A.2. Prepare training set
        for sent in train_set:
            [tokens, labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            # The labels returned above are replaced by the sub tone codes,
            # flattened into one list for the whole sentence.
            labels = get_sub_tone_code_of_sentence(sent, sel_en=args.filtering)
            labels = list(itertools.chain(*labels))

            trainer.append(features, labels)
        trainer.train(args.learn.encode('utf-8'))

        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())

        # B. Evaluation
        print 'Evaluating classifier'
        gold_set = eval_set
        predicted_set_acc = list()

        # B.1. Load trained model (the Trainer created here is not used
        # further in this branch).
        tagger = CRFTagger(verbose=args.verbose,
                           training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)
        tagger.set_model_file(args.learn.encode('utf-8'))

        # B.2 Tagging segment by segment
        predicted_set = list()
        # NOTE: the loop variable 'p' re-binds the split fraction computed
        # above; that value is not needed any more at this point.
        for p, sent in enumerate(gold_set):

            [tokens, gold_labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            labels = reshape_tokens_as_sentnece(labels, sent)

            predicted_tokens = list()
            for i, token in enumerate(sent):
                # Pair tokens[i] with labels[i] element-wise, each pair as
                # a list.
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # B.3 Assemble segments to get annotated tokens.  predicted_set_acc
        # is still the empty list created above, so the accumulator is
        # always initialised here with ['', ''] placeholders.
        if not predicted_set_acc:
            predicted_set_acc = \
             [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]

        predicted_set_acc = accumulate_tone_code_of_dataset(
            predicted_set_acc, predicted_set)
        predicted_set = predicted_set_acc

        # With --filtering the gold set gets the same filter before scoring.
        if args.filtering:
            gold_set = apply_filter_to_base_element(gold_set,
                                                    sel_en=args.filtering)

        print "Accuracy : {:>5.3f}".format(
            accuray2(gold_set, predicted_set, True))

        if args.store:
            stored_filename = args.store
            csv_export(stored_filename, gold_set, predicted_set, True)

        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(
                args.store.encode('utf-8')))

    elif args.diacritize and args.infile and args.outfile:
        # Mode 3: diacritize a raw text with an existing model.

        t1 = time.time()
        # todo : store and load chunkmode value

        # A.1. Load a CRF tagger
        tagger = CRFTagger()
        tagger.set_model_file(args.diacritize.encode('utf-8'))

        # Making observation data from undiacritized text
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'

        # non-processed token -> non-processed sentence
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append(
                    token[1]
                )  # token[1] : non-processed token from a undiacritized text
            # Unlike in learning mode, one-token sentences are kept.
            #if len(sent) > 1:
            allsents.append(sent)

        # Word segmentation
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token in sent:
                # here, we use encode as a simple chunker to get segment level
                # NOTE(review): 'chunkmode' is not defined in this function;
                # presumably a module-level global - confirm.
                [NONE,
                 chunks] = enc.differential_encode(token, token, chunkmode)
                # put (chunk,chunk) instead of chunk to fit the input format of "make_tokens_from_sentence"
                token2 = [(chunk, chunk) for chunk in chunks]
                sent2.append(token2)
            allsents.append(sent2)

        # A.2 Tagging segment by segment
        predicted_set = list()
        for p, sent in enumerate(allsents):

            [tokens, NONE] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            # Verbose: progress as "<sentence index>/<sentence count>".
            if args.verbose:
                sys.stdout.write(u"{}/{}\n".format(p, len(allsents)))
            labels = reshape_tokens_as_sentnece(labels, sent)

            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # Simple raw file writer.  Chunks found in 'cara_to_ignore' (built
        # from the 'Zl', 'Zp', 'Zs', 'Pi', 'Pf' and 'Po' categories returned
        # by the reader, plus the newline) are copied without decoding.
        cara_to_ignore = \
                      fr.get_cat_startwith('Zl') + \
                      fr.get_cat_startwith('Zp') + \
                      fr.get_cat_startwith('Zs') + u'\n' + \
                      fr.get_cat_startwith('Pi') + \
                      fr.get_cat_startwith('Pf') + \
                      fr.get_cat_startwith('Po')

        enc = encoder_tones()
        with fileReader.utf8_open(args.outfile, 'w') as fidout:
            for sent in predicted_set:
                for token in sent:
                    form = u''
                    for syllabe in token:
                        # syllabe[0], syllabe[1] -> token by chunk, label by chunk
                        if syllabe[0] in cara_to_ignore:
                            form += syllabe[0]
                        else:
                            form += enc.differential_decode(
                                syllabe[0], syllabe[1].decode('utf-8'))
                    # Tokens are written back to back; no separator is added.
                    fidout.write(form)
                #fidout.write('\n')

            # Printed while the output file is still open.
            print u"... done in", get_duration(t1_secs=t1, t2_secs=time.time())
# Accuracy of the HMM predictions: share of positions where y_hat equals y.
y = np.array(y)
y_hat = np.array(y_hat)

print("hmm acc : ", (y == y_hat).mean())

# Named-entity recognition
import pickle

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The file is opened in a 'with' block so the handle is closed after loading.
with open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb") as chunker_file:
    a = pickle.load(chunker_file)

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
# The second argument is the file in which the trained model is stored.
tagger.train(alldocs, u'crf.model')

# NOTE(review): the whole sentence is passed as a single token here and the
# result is discarded - confirm whether a list of words was intended.
tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: list of lists of words (= list of sentences)
# "smart": tag each sentence as a whole, so the tagger sees the context.
allpred_smart = [[t for _, t in tagger.tag(sentence)]
                 for sentence in adT_seq]
# "stupid": tag every word on its own, without any context.
allpred_stupid = [[tagger.tag([w])[0][1] for w in sentence]
                  for sentence in adT_seq]
Пример #12
0
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def _keyword_label(token, answer):
    """Return the event tag of one token given its tweet's annotation.

    Tokens from the "test positive" keyword list get a ``P-`` tag and tokens
    from the death keyword list get a ``D-`` tag; the suffix is ``Yes`` or
    ``No`` depending on whether ``answer`` equals ``["yes"]`` or ``["no"]``.
    Every other token/answer combination is tagged ``"Irr"``.
    """
    lowered = token.lower()
    if lowered in ("test", "positive", "result"):
        prefix = "P"
    elif lowered in ("die", "dead", "death", "pass", "away"):
        prefix = "D"
    else:
        return "Irr"

    # The annotation is compared against single-element lists, exactly as
    # the labels are stored in the input records.
    if answer == ["yes"]:
        return prefix + "-Yes"
    if answer == ["no"]:
        return prefix + "-No"
    return "Irr"


def _yes_precision_recall(gold_tags, predicted_tags):
    """Return ``(precision, recall)`` of the "Yes" class over paired tags.

    A tag counts as positive when it contains the substring ``'Yes'``.
    Every pair is inspected (including the last one), and an empty
    denominator yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    true_pos = gold_pos = pred_pos = 0
    for gold, pred in zip(gold_tags, predicted_tags):
        gold_yes = 'Yes' in gold
        pred_yes = 'Yes' in pred
        gold_pos += gold_yes
        pred_pos += pred_yes
        true_pos += gold_yes and pred_yes
    return _safe_ratio(true_pos, pred_pos), _safe_ratio(true_pos, gold_pos)


def main(positive, death):
    """Train CRF taggers on keyword-labelled tweets and report their scores.

    Args:
        positive: path of a JSON-lines file; every record must provide
            ``obj['text']`` and ``obj['annotation']['part1.Response']``.
        death: path of a second JSON-lines file with the same record layout.

    Side effects:
        Trains one model per feature function into ``model.tagger``, writes
        ``base_final_output.tsv`` and ``include_neighbors_final_output.tsv``
        (token, gold tag, predicted tag; blank line between sentences) and
        prints accuracy / precision / recall / F1, a confusion matrix and the
        label counts of the training split.
    """
    ############# Compile the dataset ###############
    ## Load the dataset
    text = []
    response = []
    for path in (positive, death):
        # The context manager closes the reader once the file is consumed.
        with jsonlines.open(path) as input_file:
            for obj in input_file:
                text.append(obj['text'])
                response.append(obj['annotation']['part1.Response'])

    ## Tweet Preprocessing
    prep_text = [p.clean(tweet) for tweet in text]

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    token_data = []
    for sent, answer in zip(prep_text, response):
        # NOTE(review): only POS tags starting with a/n/v select a WordNet
        # POS; if pos_tag emits Penn Treebank tags, adjectives ("JJ") fall
        # through to the default lemmatization -- confirm this is intended.
        lemmas = [
            wnl.lemmatize(word, tag[0].lower())
            if tag[0].lower() in ('a', 'n', 'v') else wnl.lemmatize(word)
            for word, tag in pos_tag(word_tokenize(sent))
        ]
        ### Create labels: every token becomes a [token, tag] pair
        token_data.append(
            [[lemma, _keyword_label(lemma, answer)] for lemma in lemmas])

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')

        # Output: one "token<TAB>gold<TAB>prediction" line per token.
        filename = cond + "_final_output.tsv"
        # Tweets are not ASCII-only, so fix the encoding instead of relying
        # on the platform default.
        with open(filename, 'w', encoding="utf-8") as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]

                with_tags = crf.tag(sent_words)
                for (original_word, tag_prediction), gold in zip(
                        with_tags, gold_tags):
                    pred_file.write(
                        f"{original_word}\t{gold}\t{tag_prediction}\n")
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    cond_list = list(condition_to_func)

    for cond in cond_list:
        filename = cond + "_final_output.tsv"

        D_data = []
        P_data = []
        with open(filename, encoding="utf-8", newline="") as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            for row in rd:
                # Sentence separators are blank lines -> rows with < 2 cells.
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)

        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the two frames.
        Total_df = pd.concat([P_df, D_df], ignore_index=True)

        # Accuracy
        ## Overall Accuracy (NaN when no keyword token reached the dev set)
        T_a = (accuracy_score(Total_df['label'], Total_df['prediction'])
               if len(Total_df) else float("nan"))

        ## Accuracy, Precision, and Recall for two events
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(
                accuracy_score(df['label'], df['prediction'])
                if len(df) else float("nan"))
            event_precision, event_recall = _yes_precision_recall(
                df['label'], df['prediction'])
            precision.append(event_precision)
            recall.append(event_recall)

        ## F-1 (0.0 when precision and recall are both zero)
        f1 = [
            _safe_ratio(2 * prec * rec, prec + rec)
            for prec, rec in zip(precision, recall)
        ]

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']

        for num, event in enumerate(covid_event):
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )

    ## Basicline Performance / Confusion Matrix
    # NOTE(review): Total_df is the frame of the LAST evaluated condition
    # only; move this into the loop if one matrix per condition is wanted.
    print("Confusion Matrix:")
    print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    # Collect the tag column once; counting tags only (not tokens) avoids
    # miscounting a token that happens to equal a label string and also
    # works when a sentence is empty.
    train_tags = [item[1] for sent in train_data for item in sent]
    for label in labels:
        n_label = train_tags.count(label)
        print(f"Number of {label}: {n_label}")