def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, training_opt):
    """Train a label-powerset CRF tagger on one CV fold.

    Returns four structures keyed by tag code: actual binary tags for the
    training and validation sets, and the corresponding predictions.
    """
    print("Fold %i Training code" % fold)

    # Collapse the multi-label tags into single power-set labels.
    train_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # Randomized suffix so concurrently-running folds don't collide on disk.
    tmp_model_file = models_folder + "/" + "%i_%s__%s" % (fold, "power_set", str(randint(0, 9999999)))

    tagger = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    tagger.train(train_sents, tmp_model_file)

    preds_train = tagger.tag_sents(to_sentences(train_sents))
    preds_valid = tagger.tag_sents(to_sentences(valid_sents))

    # Actual (gold) binary tags per code, for evaluation.
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(train_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(valid_sents, regular_tags)

    # Predicted binary tags per code.
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(preds_train, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(preds_valid, regular_tags)

    # The model file was only needed to produce predictions; clean it up.
    os.remove(tmp_model_file)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one binary CRF tagger per tag code on a single CV fold.

    Returns four dicts keyed by code: gold binary tags for the training and
    validation sets, and the matching predictions.
    """
    train_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    valid_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = {}
    wd_vd_ys_bytag = {}
    td_wd_predictions_by_code = {}
    vd_wd_predictions_by_code = {}

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        train_sents = train_by_code[code]
        valid_sents = valid_by_code[code]

        # Randomized suffix so concurrently-running folds don't collide.
        # CRFsuite documentation: http://www.chokkan.org/software/crfsuite/manual.html
        tmp_model = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        tagger = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        tagger.train(train_sents, tmp_model)

        # Gold labels, flattened to one binary tag per word.
        wd_td_ys_bytag[code] = to_flattened_binary_tags(train_sents)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(valid_sents)

        preds_train = tagger.tag_sents(to_sentences(train_sents))
        preds_valid = tagger.tag_sents(to_sentences(valid_sents))

        # Model file was only needed for tagging; name was randomized above,
        # so remove it here to avoid littering the models folder.
        os.remove(tmp_model)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(preds_train)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(preds_valid)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def demo(self, test_sents):
    """Tag *test_sents* with the stored model, print each reconstructed
    sentence, then print the tagger's evaluation score on the same data."""
    crf = CRFTagger(feature_func=self.feature_detector)
    crf.set_model_file(self.modelpath)
    for gold_sent in test_sents:
        # Strip the gold tags, re-tag from scratch, and display the result.
        predicted = crf.tag(untag(gold_sent))
        for line in self._to_sentence(predicted):
            print(line)
    print(crf.evaluate(test_sents))
def train(self, load_model=None):
    """Fit a CRF tagger on this instance's training set.

    Prepares the data, builds the feature-extraction callback, trains a
    ``CRFTagger`` with preset optimizer options, stores it on ``self.model``
    and returns ``self`` for chaining.

    Args:
        load_model: kept for interface compatibility; not used in this body.
    """
    prepared = CRF._fin_data_prep(self.train_set)
    feature_fn = self._gen_ftr_func()
    # L-BFGS options: memory size and convergence tolerance.
    opts = {"num_memories": 500, "delta": 1e-8}
    self.model = CRFTagger(feature_fn, verbose=False, training_opt=opts)
    # The trained model is always written to the fixed file 'stc_crf_model'.
    self.model.train(prepared, 'stc_crf_model')
    return self
def pyt_sent_tokenizer(self, paragraph):
    """Split a paragraph into sentences (pytest helper).

    Args:
        paragraph (str): paragraph text. (The original docstring said
            ``list(str)``, but the code calls ``paragraph.strip()``, so a
            single string is expected.)

    Returns:
        list(list(str)): the paragraph cut into sentence units.
    """
    tagger = CRFTagger(feature_func=self.feature_detector)
    tagger.set_model_file(self.modelpath)
    # Raw string: '\s' in a plain literal is an invalid escape
    # (DeprecationWarning today, SyntaxError in future Pythons).
    words = re.split(r'\s+', paragraph.strip())
    tagged = tagger.tag(words)
    return self._to_sentence(tagged)
def batch_sent_tokenizer(self, paragraphs):
    """Split paragraphs into sentences.

    Args:
        paragraphs (list(str)): paragraphs to tokenize.

    Returns:
        list: per-paragraph sentence lists, as produced by ``_to_sentence``.
    """
    tagger = CRFTagger(feature_func=self.feature_detector)
    tagger.set_model_file(self.modelpath)
    sentences = []
    for paragraph in paragraphs:
        # r'\s+' instead of '\s': a raw string avoids the invalid-escape
        # warning, and '+' collapses runs of whitespace so consecutive
        # blanks no longer produce empty tokens — matching the sibling
        # pyt_sent_tokenizer, which already splits on '\s+'.
        words = re.split(r'\s+', paragraph.strip())
        tagged = tagger.tag(words)
        sentences.append(self._to_sentence(tagged))
    return sentences
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on one CV fold using most-frequent-code tags.

    Gold labels for evaluation are taken from the label power-set encoding;
    returns gold and predicted binary tags (keyed by code) for the training
    and validation sets.
    """
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # Tag each word with its single most frequent code.
    train_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    valid_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # Randomized suffix keeps parallel folds from clobbering each other.
    tmp_model_file = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    tagger = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    tagger.train(train_sents, tmp_model_file)

    preds_train = tagger.tag_sents(to_sentences(train_sents))
    preds_valid = tagger.tag_sents(to_sentences(valid_sents))

    # Gold labels come from the power-set encoding, not the training tags.
    gold_train = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    gold_valid = to_label_powerset_tagged_sentences(essays_VD, regular_tags)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(gold_train, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(gold_valid, regular_tags)

    # Predicted binary tags per code.
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(preds_train, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(preds_valid, regular_tags)

    # Model file is temporary; remove once predictions are in hand.
    os.remove(tmp_model_file)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def main():
    """Daba disambiguator CLI: learn a CRF model (POS and/or tones) from a
    file list, or use a trained model to disambiguate an .html corpus file.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist', help='Path to a list of files to learn from')
    # NOTE(review): '-g/--gloss' is commented out, yet args.gloss is read
    # below — parse_args() will not define it, so that access should raise
    # AttributeError. Confirm against the full file/project.
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10, help='Percent of training data with respect to training and test one (default 10)')
    aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None)
    aparser.add_argument('--select', help='Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true')
    aparser.add_argument('-i', '--infile', help='Input file (.html)', default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
    aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None)
    args = aparser.parse_args()
    if args.verbose:
        print(args)
    if args.learn and (args.pos or args.tone or args.gloss):
        # NOTE(review): dead branch — the enclosing condition already
        # guarantees one of pos/tone/gloss is set.
        if not (args.pos or args.tone or args.gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)
        print('Make list of files')
        # Read the corpus file list (one relative path per line).
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []
        # for debugging:
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'
        if args.tone:
            # NOTE(review): bare except silently swallows every error kind.
            try:
                enc = encoder_tones()
            except:
                enc = None
                print(("Error : unable to initialize the tone encoder !"))
        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if (infile):
                print('-', infile)
                sent = []
                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))
                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        tag = ''
                        if token.type == 'w' or token.type == 'c':
                            tags = ''
                            if args.pos:
                                # Supervision tag: slash-joined POS labels;
                                # the word form is detoned for training.
                                tags = '/'.join(token.gloss.ps)
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms containing a vertical
                                # bar? In the disambiguated corpora they occur
                                # fewer than 10 times — too rare to improve the
                                # tonalization model. Nothing in the design
                                # forbids including them later, though.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append((chunk, code))
                                        except LookupError:
                                            pass
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss
                                sent.append((token.token, tags))
                            """
                    # One training sequence per corpus sentence.
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []
        if args.verbose and args.tone:
            enc.report()
        # Build the training and evaluation sets.
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set), ' sentences) / test (', len(eval_set), ' sentences)')
        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            # Tone mode trains one model per phase; models are bundled in a zip.
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1
        # Training
        for phase in range(num_phases):
            # CRFTagger is used only to hold options; the actual training
            # goes through a raw pycrfsuite.Trainer.
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn
            # train_set : list(list((str,list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    # NOTE(review): this loop is a no-op; looks like leftover
                    # debugging scaffolding.
                    for lab in labels:
                        pass
                    # Keep only this phase's sub-code of each label.
                    labels = [code_dispatcher(label)[phase] for label in labels]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                # Archive the per-phase model, then drop the loose file.
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()
        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))
        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent, j) for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label)[phase] for label in labels]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    # Accumulate per-phase label fragments by concatenation.
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()
        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            # Re-sort the accumulated code fragments into canonical order.
            predicted_tokens = [tuple([pair[0], code_resort(pair[1])]) for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens
        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)
        print("Accuracy : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval)))
        if args.verbose and args.store:
            print(("Tagged result is exported in {}".format(args.store)))
    elif args.disambiguate and args.infile and args.outfile:
        # Read the .HTML input text.
        html_parser = FileParser()
        tagger = CRFTagger()
        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # NOTE(review): message interpolates args.infile, but the
                # file that failed to open is args.disambiguate.
                print("Error : unable to open the model {} !".format(args.infile))
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(args.infile))
                exit(1)
            # Export the disambiguation result as .HTML.
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            # Marginal probability of this gloss's POS tag
                            # at position tnum, used to rank the options.
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(sorted(options, reverse=True))
                        if args.select:
                            # Keep only the most probable gloss(es).
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([reordered_options[i] for i, p in enumerate(reordered_probs) if p >= prob_max])
                        html_parser.glosses[snum][1][tnum] = reordered_options
        elif args.tone:
            # Tone disambiguation not implemented here.
            pass
        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(args.outfile))
    else:
        aparser.print_help()
        exit(0)
def main():
    """Daba disambiguator CLI (legacy Python 2 version): learn a CRF model
    from the ../corbama corpus with POS/tone/gloss supervision tags.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    # NOTE(review): no type=int here — a user-supplied -e value stays a
    # string, and args.evalsize/100.0 below would then raise TypeError.
    aparser.add_argument('-e', '--evalsize', help='Percent of randomized data to use for evaluation (default 10)', default=10)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()
    if args.learn:
        # NOTE(review): operator precedence bug — this parses as
        # (not args.pos) or args.tone or args.gloss; intended is
        # not (args.pos or args.tone or args.gloss).
        if not args.pos or args.tone or args.gloss:
            print 'Choose pos, tone, gloss or combination of them'
            exit(0)
        print 'Make list of files'
        # NOTE(review): zip() truncates to the shorter glob, so files are
        # silently dropped when the two directory levels differ in count.
        files1 = glob.iglob("../corbama/*/*.dis.html")
        files2 = glob.iglob("../corbama/*.dis.html")
        allfiles = ""
        for file1, file2 in zip(files1, files2):
            allfiles += file1+','+file2+','
        allsents = []
        print 'Open files and find features / supervision tags'
        for infile in allfiles.split(','):
            if(len(infile)) :
                print '-', infile
                sent = []
                in_handler = formats.HtmlReader(infile, compatibility_mode=False)
                for token in in_handler:
                    tag = ''
                    if token.type == 'w' or token.type == 'c':
                        # Supervision tag is the concatenation of whichever
                        # of POS / tonal form / gloss were requested.
                        tags = ''
                        if args.pos:
                            for ps in token.gloss.ps:
                                tags += ps
                        if args.tone:
                            tags += token.gloss.form.encode('utf-8')
                        if args.gloss:
                            tags += token.gloss.gloss.encode('utf-8')
                        sent.append((token.token, tags))
                    # Sentence boundary: final punctuation token.
                    if token.type == 'c' and token.token in ['.', '?', '!']:
                        if len(sent) > 1:
                            allsents.append(sent)
                        sent = []
        datalength = len(allsents)
        p = (1-args.evalsize/100.0)
        print 'Randomize and split the data in train (', int(p*datalength),' sentences) / test (', int(datalength-p*datalength),' sentences)'
        # Fixed seed keeps the shuffle (and hence the split) reproducible.
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:int(p*datalength)]
        test_set = allsents[int(p*datalength):datalength]
        print 'Building classifier (CRF/NLTK)'
        tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        t2 = time.time()
        texec = t2-t1
        # NOTE(review): localtime(duration) only formats correctly for
        # durations under a day; it treats the delta as an epoch timestamp.
        print "... done in", time.strftime('%H %M %S', time.localtime(texec))
        print 'Evaluating classifier'
        print tagger.evaluate(test_set)
        if args.verbose:
            print 'Compute detailed output'
    else:
        print 'USE...'
        # NOTE(review): undefined name — the parser is 'aparser', not
        # 'parser'; this branch raises NameError as written.
        parser.print_help()
        exit(0)
def main():
    """Tonalizer CLI (legacy Python 2): learn a CRF tone-reconstitution
    model from diacritized text, diacritize raw text with a trained model,
    or strip diacritics from a text.
    """
    aparser = argparse.ArgumentParser(description=u'Tonalizer - CRF-based Tone Reconstitution Tool')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument('-l', '--learn', help='Learn model from diacritized text (and save as file if provided)', default=None, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-e', '--evalsize', help='Percent of training data with respect to training and test one (default 10)', default=10, type=float)
    # NOTE(review): '-c/--chunkmode' is commented out, but 'chunkmode' is
    # used below — it must be a module-level global or this raises
    # NameError. Confirm against the full file.
    #aparser.add_argument('-c', '--chunkmode', help='Word segmentation width (default 3)', default=3, type=int)
    aparser.add_argument('-d', '--diacritize', help='Use model file to diacritize a raw text', default=None)
    aparser.add_argument('-u', '--undiacritize', help='Undiacritize a raw text', default=False, action='store_true')
    aparser.add_argument('-f', '--filtering', help='Keep only one insertion for one poistion', default=False, action='store_true')
    aparser.add_argument('-m', '--markers', help='Custumed set of markers to learn', default=None, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-i', '--infile', help='Input file (.txt)', default=sys.stdin, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-o', '--outfile', help='Output file (.txt)', default=sys.stdout, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-s', '--store', help='Store evaluation result in file (.csv), effective only in learning mode', default=None, type=lambda s: unicode(s, 'utf8'))
    args = aparser.parse_args()
    if not (args.learn or args.diacritize or args.undiacritize):
        print 'Error : choose -learn, -diacritize or -undiacritize !'
        aparser.print_help()
        exit(0)
    if args.verbose:
        # Echo the parsed arguments (with their types for non-empty values).
        print 'Arguments received by script'
        dico = vars(args)
        for key, val in dico.items():
            typeName = type(val).__name__
            sys.stdout.write(u"\t{} = {} ".format(key, val))
            if val:
                sys.stdout.write(u"({})".format(typeName))
            print ""
    if args.undiacritize:
        # Plain diacritic stripping; no model involved.
        fr = fileReader.fileReader(args.markers)
        fr.read2(args.infile, args.outfile)
    elif args.learn:
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append((token[0], token[1].encode('utf-8')))
            if len(sent) > 1:
                allsents.append(sent)
        print 'Word segmentation and diacritic informaiotn compression'
        # Re-encode each (token, tags) pair as per-syllable (chunk, code)
        # pairs via the differential tone encoder.
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token_tags in sent:
                token, tags = token_tags
                [codes, syllabes] = enc.differential_encode(token, tags.decode('utf-8'), chunkmode)
                token2 = [(syllabe, code.encode('utf-8')) for syllabe, code in zip(syllabes, codes)]
                sent2.append(token2)
            allsents.append(sent2)
        if args.verbose:
            enc.report()
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(train_set), ' sentences) / test (', len(eval_set), ' sentences)'
        print 'Building classifier (pyCRFsuite)'
        # Initialization
        t1 = time.time()
        # A.1. Initialize a new CRF trainer (CRFTagger only supplies the
        # training options; pycrfsuite.Trainer does the actual work).
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)
        # A.2. Prepare training set
        for sent in train_set:
            [tokens, labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            # NOTE(review): 'labels' from the line above is immediately
            # overwritten here — the first assignment is dead.
            labels = get_sub_tone_code_of_sentence(sent, sel_en=args.filtering)
            labels = list(itertools.chain(*labels))
            trainer.append(features, labels)
        trainer.train(args.learn.encode('utf-8'))
        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())
        # B. Evaluation
        print 'Evaluating classifier'
        gold_set = eval_set
        predicted_set_acc = list()
        # B.1. Load trained model
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)
        tagger.set_model_file(args.learn.encode('utf-8'))
        # B.2 Tagging segment by segment
        predicted_set = list()
        for p, sent in enumerate(gold_set):
            [tokens, gold_labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            labels = reshape_tokens_as_sentnece(labels, sent)
            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)
        # B.3 Assemble segments to get annotated tokens.
        if not predicted_set_acc:
            # Seed the accumulator with empty (syllable, code) cells shaped
            # like the prediction set.
            predicted_set_acc = \
                [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]
        predicted_set_acc = accumulate_tone_code_of_dataset(predicted_set_acc, predicted_set)
        predicted_set = predicted_set_acc
        if args.filtering:
            gold_set = apply_filter_to_base_element(gold_set, sel_en=args.filtering)
        print "Accuracy : {:>5.3f}".format(accuray2(gold_set, predicted_set, True))
        if args.store:
            stored_filename = args.store
            csv_export(stored_filename, gold_set, predicted_set, True)
        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(args.store.encode('utf-8')))
    elif args.diacritize and args.infile and args.outfile:
        t1 = time.time()
        # todo : store and load chunkmode value
        # A.1. Load a CRF tagger
        tagger = CRFTagger()
        tagger.set_model_file(args.diacritize.encode('utf-8'))
        # Making observation data from undiacritized text
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'
        # non-processed token -> non-processed sentence
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append(token[1])  # token[1] : non-processed token from a undiacritized text
            #if len(sent) > 1:
            allsents.append(sent)
        # Word segmentation
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token in sent:
                # here, we use encode as a simple chunker to get segment level
                [NONE, chunks] = enc.differential_encode(token, token, chunkmode)
                # put (chunk,chunk) instead of chunk to fit the input format of "make_tokens_from_sentence"
                token2 = [(chunk, chunk) for chunk in chunks]
                sent2.append(token2)
            allsents.append(sent2)
        # A.2 Tagging segment by segment
        predicted_set = list()
        for p, sent in enumerate(allsents):
            [tokens, NONE] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            if args.verbose:
                # Progress indicator: sentence index / total.
                sys.stdout.write(u"{}/{}\n".format(p, len(allsents)))
            labels = reshape_tokens_as_sentnece(labels, sent)
            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)
        # simple raw file writer
        # Characters in these Unicode categories (separators, punctuation)
        # pass through without tone decoding.
        cara_to_ignore = \
            fr.get_cat_startwith('Zl') + \
            fr.get_cat_startwith('Zp') + \
            fr.get_cat_startwith('Zs') + u'\n' + \
            fr.get_cat_startwith('Pi') + \
            fr.get_cat_startwith('Pf') + \
            fr.get_cat_startwith('Po')
        enc = encoder_tones()
        with fileReader.utf8_open(args.outfile, 'w') as fidout:
            for sent in predicted_set:
                for token in sent:
                    form = u''
                    for syllabe in token:
                        # syllabe[0], syllabe[1] -> token by chunk, label by chunk
                        if syllabe[0] in cara_to_ignore:
                            form += syllabe[0]
                        else:
                            form += enc.differential_decode(syllabe[0], syllabe[1].decode('utf-8'))
                    fidout.write(form)
                #fidout.write('\n')
        print u"... done in", get_duration(t1_secs=t1, t2_secs=time.time())
# Report HMM tagging accuracy: element-wise comparison of gold vs predicted.
# NOTE(review): assumes y and y_hat are equally-shaped sequences defined
# earlier in the file — confirm upstream.
y = np.array(y)
y_hat = np.array(y_hat)
print("hmm acc : ", (y == y_hat).mean())

#named entities recognition
import pickle
# NOTE(review): absolute user-specific path; unpickling also executes
# arbitrary code — only load trusted files.
a = pickle.load(
    open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb"))

from nltk.tag.crf import CRFTagger
tagger = CRFTagger()
# Also pass the file where the computed features/model are stored.
tagger.train(alldocs, u'crf.model')
# NOTE(review): tag() expects a token list; this passes one whole sentence
# as a single token.
tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)
# adT_seq: list of word lists (= list of sentences).
# "smart": tag each sentence as a sequence (context available).
allpred_smart = [[t for w, t in tagger.tag(adT_seq[i])] for i in range(len(adT_seq))]
# "stupid": tag each word in isolation (no context) for comparison.
allpred_stupid = [[tagger.tag([w])[0][1] for w in adT_seq[i]] for i in range(len(adT_seq))]
def main(positive, death):
    """Train and evaluate CRF taggers that flag COVID 'test positive' and
    'death' events in tweets.

    Args:
        positive: path to a jsonlines file of tweets annotated for the
            test-positive event.
        death: path to a jsonlines file of tweets annotated for deaths.
    """
    ############# Compile the dataset ###############
    ## Load the dataset
    text = list()
    response = list()
    file_path = [positive, death]
    for path in file_path:
        input_file = jsonlines.open(path)
        for obj in input_file:
            text.append(obj['text'])
            response.append(obj['annotation']['part1.Response'])
    ## Tweet Preprocessing
    # NOTE(review): 'p' is presumably the tweet-preprocessor module
    # imported elsewhere in the file — confirm.
    prep_text = list()
    for i in text:
        prep_text.append(p.clean(i))
    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    n_corpus = len(prep_text)
    token_data = ["test"] * n_corpus
    n = 0
    for sent in prep_text:
        # Lemmatize with the POS hint when it maps to adjective/noun/verb.
        token_data[n] = [
            wnl.lemmatize(i, j[0].lower()) if j[0].lower() in ['a', 'n', 'v']
            else wnl.lemmatize(i) for i, j in pos_tag(word_tokenize(sent))
        ]
        n = n + 1
    ### Create labels
    death_list = ["die", "dead", "death", "pass", "away"]
    n = 0
    for sent in token_data:
        # Each token becomes [token, label]; keyword tokens inherit the
        # tweet-level yes/no annotation, everything else is 'Irr'.
        for idx, token in enumerate(sent):
            if ((token.lower() in ["test", "positive", "result"]) and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "P-Yes"]
            elif ((token.lower() in ["test", "positive", "result"]) and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "P-No"]
            elif ((token.lower() in death_list) and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "D-Yes"]
            elif ((token.lower() in death_list) and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "D-No"]
            else:
                sent[idx] = [sent[idx], "Irr"]
        n = n + 1
    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data, test_size=0.3, random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )
    ############# Fit A CRF Model And Predict ###############
    # Two feature conditions: token-only vs token+neighbor features.
    condition_to_func = {"base": my_features, "include_neighbors": neighbor_features}
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')
        # Test
        # NOTE(review): these two calls exercise the tagger but discard the
        # results; _feature_func is a private attribute of CRFTagger.
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])
        # Output: word <TAB> gold <TAB> predicted, one token per line.
        filename = cond + "_final_output.tsv"
        with open(filename, 'w') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]
                with_tags = crf.tag(sent_words)
                for i, output in enumerate(with_tags):
                    original_word, tag_prediction = output
                    line_as_str = f"{original_word}\t{gold_tags[i]}\t{tag_prediction}\n"
                    pred_file.write(line_as_str)
                # add an empty line after each sentence
                pred_file.write("\n")
    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    cond_list = ['base', 'include_neighbors']
    for cond in cond_list:
        filename = cond + "_final_output.tsv"
        with open(filename) as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            D_data = []
            P_data = []
            for row in rd:
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)
        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        # NOTE(review): DataFrame.append is deprecated (removed in pandas
        # 2.0); pd.concat([P_df, D_df]) is the modern equivalent.
        Total_df = P_df.append(D_df)
        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])
        ## Accuracy, Precision, and Recall for two events
        # NOTE(review): range(0, len(df) - 1) skips the last row, and the
        # denominators can be zero (ZeroDivisionError) when no 'Yes' rows
        # exist — both look like latent bugs.
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            precision.append(
                sum(1 for item in range(0, len(df) - 1)
                    if ('Yes' in df['label'][item] and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0, len(df) - 1)
                    if ('Yes' in df['prediction'][item])))
            recall.append(
                sum(1 for item in range(0, len(df) - 1)
                    if ('Yes' in df['label'][item] and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0, len(df) - 1)
                    if ('Yes' in df['label'][item])))
        ## F-1
        f1 = []
        for num in [0, 1]:
            f1.append((2 * precision[num] * recall[num]) / (precision[num] + recall[num]))
        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']
        num = 0
        for event in covid_event:
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )
            num = num + 1
        ## Basicline Performance / Confusion Matrix
        print("Confusion Matrix:")
        print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    # Label distribution in the training data, for reference.
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    for label in labels:
        train_data2 = np.concatenate(train_data).flat
        n_label = sum(1 for item in train_data2 if item == label)
        print(f"Number of {label}: {n_label}")