def write_anotations_to_file(lst_annotation, file_name):
    """Write annotations to *file_name* in CoNLL-style token-per-line format.

    For each annotation the output is: the tokens preceding the car name
    labelled 'O', the car-name tokens labelled 'B' (first token) / 'I'
    (rest), the tokens after it labelled 'O', then a blank line separating
    annotations.

    :param lst_annotation: iterable of annotation objects exposing ``.text``
        (full text) and ``.name`` (the car name) — assumed from usage here.
    :param file_name: path of the UTF-8 file to (over)write.
    """
    with codecs.open(file_name, 'w', 'utf-8') as f:
        for annotation in lst_annotation:
            annotation_full_text = annotation.text
            car_name = preprocessor_text(annotation.name)
            annotation_start = annotation_full_text.find(car_name)
            if annotation_start == -1:
                # Preprocessed name does not occur in the text: emitting
                # B/I labels would mislabel arbitrary tokens, so label the
                # whole text 'O' instead of producing misaligned output.
                for token in word_tokenize(preprocessor_text(annotation_full_text.strip())):
                    f.write(token + u' ' + u'O' + u'\n')
                f.write(u'\n')
                continue
            # BUG FIX: the end offset must be relative to the position just
            # found, not annotation.start (an attribute indexing some other
            # text), otherwise the 'after' slice is misaligned.
            annotation_end = annotation_start + len(car_name)
            full_text_before_annotation = preprocessor_text(
                annotation_full_text[:annotation_start].strip())
            for token in word_tokenize(full_text_before_annotation):
                f.write(token + u' ' + u'O' + u'\n')
            for idx, token in enumerate(word_tokenize(car_name)):
                # First token of the entity gets 'B', the rest 'I' (BIO).
                label = u'B' if idx == 0 else u'I'
                f.write(token + u' ' + label + u'\n')
            full_text_after_annotation = preprocessor_text(
                annotation_full_text[annotation_end:]).strip()
            for token in word_tokenize(full_text_after_annotation):
                f.write(token + u' ' + u'O' + u'\n')
            # Blank line terminates this annotation's sentence block.
            f.write(u'\n')
def write_anotations_to_file(lst_annotation, file_name):
    """Write annotations to *file_name* in CoNLL-style token-per-line format.

    For each annotation: tokens before the car name are labelled 'O', the
    car-name tokens 'B'/'I', tokens after 'O', followed by a blank separator
    line.

    :param lst_annotation: iterable of objects with ``.text`` and ``.name``
        attributes — assumed from usage here.
    :param file_name: path of the UTF-8 file to (over)write.
    """
    with codecs.open(file_name, 'w', 'utf-8') as f:
        for annotation in lst_annotation:
            annotation_full_text = annotation.text
            car_name = preprocessor_text(annotation.name)
            annotation_start = annotation_full_text.find(car_name)
            if annotation_start == -1:
                # Name absent after preprocessing: label everything 'O'
                # rather than emit misaligned B/I labels.
                for token in word_tokenize(preprocessor_text(annotation_full_text.strip())):
                    f.write(token + u' ' + u'O' + u'\n')
                f.write(u'\n')
                continue
            # BUG FIX: end offset is relative to the found position, not to
            # annotation.start, which indexes a different string.
            annotation_end = annotation_start + len(car_name)
            full_text_before_annotation = preprocessor_text(
                annotation_full_text[:annotation_start].strip())
            for token in word_tokenize(full_text_before_annotation):
                f.write(token + u' ' + u'O' + u'\n')
            for idx, token in enumerate(word_tokenize(car_name)):
                # BIO scheme: first entity token 'B', subsequent ones 'I'.
                label = u'B' if idx == 0 else u'I'
                f.write(token + u' ' + label + u'\n')
            full_text_after_annotation = preprocessor_text(
                annotation_full_text[annotation_end:]).strip()
            for token in word_tokenize(full_text_after_annotation):
                f.write(token + u' ' + u'O' + u'\n')
            # Blank line separates annotations in the CoNLL file.
            f.write(u'\n')
def predict_cars(clf, sentence):
    """Tag *sentence* with *clf* and return the car names found in it.

    The sentence is preprocessed and tokenized, written to a temporary
    CoNLL file with dummy 'O' labels (load_conll needs the file format),
    re-loaded as a feature matrix, and predicted. Runs of B/I tags are
    joined back into car-name strings.

    :param clf: trained sequence classifier with ``predict(X, lengths)``.
    :param sentence: raw input text.
    :return: list of car-name strings (possibly empty).
    """
    test_f_name = os.path.join(current_dir, './../data/test_ann')
    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            # Dummy 'O' labels: only the tokens matter for prediction.
            f.write(t + u' ' + u'O' + u'\n')
        f.flush()
    X, y, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)
    found_cars = []
    current_car = []
    for idx, token in enumerate(y_pred):
        t = str(token)
        if t == 'B':
            # BUG FIX: 'B' starts a NEW entity — flush any entity in
            # progress first; previously two adjacent cars (B ... B ...)
            # were merged into a single name.
            if current_car:
                found_cars.append(u' '.join(current_car))
            current_car = [tokens[idx]]
        elif t == 'I':
            current_car.append(tokens[idx])
        else:
            # BUG FIX: close the entity on any 'O', even when it was
            # started by a stray 'I'; previously such tokens leaked into
            # the next entity.
            if current_car:
                found_cars.append(u' '.join(current_car))
                current_car = []
    if current_car:
        found_cars.append(u' '.join(current_car))
    return found_cars
def predict_cars(clf, sentence):
    """Predict car-name mentions in *sentence* with the trained model *clf*.

    Tokens are dumped to a temporary CoNLL file with placeholder 'O' labels
    so load_conll can build the feature matrix, then tagged; consecutive
    B/I-tagged tokens are joined into car-name strings.

    :param clf: trained sequence classifier with ``predict(X, lengths)``.
    :param sentence: raw input text.
    :return: list of car-name strings (possibly empty).
    """
    test_f_name = os.path.join(current_dir, './../data/test_ann')
    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            # Placeholder 'O' labels; only the token column is used.
            f.write(t + u' ' + u'O' + u'\n')
        f.flush()
    X, y, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)
    found_cars = []
    current_car = []
    for idx, token in enumerate(y_pred):
        t = str(token)
        if t == 'B':
            # BUG FIX: a 'B' tag begins a new entity, so close the one in
            # progress; the original merged adjacent entities (B ... B ...)
            # into a single car name.
            if current_car:
                found_cars.append(u' '.join(current_car))
            current_car = [tokens[idx]]
        elif t == 'I':
            current_car.append(tokens[idx])
        else:
            # BUG FIX: flush on 'O' even for an entity begun by a stray
            # 'I'; the original carried those tokens into the next entity.
            if current_car:
                found_cars.append(u' '.join(current_car))
                current_car = []
    if current_car:
        found_cars.append(u' '.join(current_car))
    return found_cars