Example #1
import codecs

from nltk.tokenize import word_tokenize

# preprocessor_text is a project-specific normalisation helper defined elsewhere.


def write_anotations_to_file(lst_annotation, file_name):
    with codecs.open(file_name, 'w', 'utf-8') as f:
        for annotation in lst_annotation:
            annotation_full_text = annotation.text
            car_name = preprocessor_text(annotation.name)

            # Locate the annotated car name inside the full text.
            annotation_start = annotation_full_text.find(car_name)
            if annotation_start == -1:
                # Skip annotations whose name is not found after preprocessing.
                continue
            annotation_end = annotation_start + len(car_name)

            # Tokens before the annotation are labelled 'O' (outside).
            full_text_before_annotation = preprocessor_text(
                annotation_full_text[:annotation_start].strip())
            for token in word_tokenize(full_text_before_annotation):
                f.write(token + u' ' + u'O' + u'\n')

            # The annotation itself: 'B' on the first token, 'I' on the rest.
            for idx, token in enumerate(word_tokenize(car_name)):
                label = u'B' if idx == 0 else u'I'
                f.write(token + u' ' + label + u'\n')

            # Tokens after the annotation are labelled 'O' as well.
            full_text_after_annotation = preprocessor_text(
                annotation_full_text[annotation_end:]).strip()
            for token in word_tokenize(full_text_after_annotation):
                f.write(token + u' ' + u'O' + u'\n')

            # A blank line separates sentences in CoNLL format.
            f.write(u'\n')
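
For reference, a minimal usage sketch. The Annotation namedtuple here is hypothetical: only the .text and .name attributes that write_anotations_to_file actually reads are assumed, and the expected file contents assume preprocessor_text leaves this example text unchanged.

from collections import namedtuple

# Hypothetical stand-in for the project's annotation objects.
Annotation = namedtuple('Annotation', ['text', 'name'])

annotations = [Annotation(text=u'I drive a Toyota Corolla to work',
                          name=u'Toyota Corolla')]
write_anotations_to_file(annotations, 'train_ann')

# Expected contents of train_ann (token, space, BIO label per line):
# I O
# drive O
# a O
# Toyota B
# Corolla I
# to O
# work O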
Example #2
import codecs
import os

from nltk.tokenize import word_tokenize
from seqlearn.datasets import load_conll

# preprocessor_text, features and current_dir are project-specific and defined elsewhere.


def predict_cars(clf, sentence):
    test_f_name = os.path.join(current_dir, './../data/test_ann')

    # Write the sentence as a one-token-per-line CoNLL file with dummy 'O' labels.
    sentence = preprocessor_text(sentence)
    tokens = word_tokenize(sentence)
    with codecs.open(test_f_name, 'w', 'utf-8') as f:
        for t in tokens:
            f.write(t + u' ' + u'O' + u'\n')

    # Extract features for every token and predict its BIO label.
    X, _, lengths = load_conll(test_f_name, features)
    y_pred = clf.predict(X, lengths)

    # Stitch 'B'/'I' runs back together into multi-token car names.
    found_cars = []
    current_car = []
    for idx, label in enumerate(y_pred):
        label = str(label)
        if label == 'B':
            # A new entity starts here: flush the previous one, if any.
            if current_car:
                found_cars.append(u' '.join(current_car))
            current_car = [tokens[idx]]
        elif label == 'I':
            current_car.append(tokens[idx])
        elif current_car:
            found_cars.append(u' '.join(current_car))
            current_car = []
    if current_car:
        found_cars.append(u' '.join(current_car))
    return found_cars
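
A minimal end-to-end sketch of how predict_cars might be used with seqlearn's StructuredPerceptron. The training file name and the features function below are assumptions: in the original project, features is defined elsewhere and passed to load_conll above.

from seqlearn.datasets import load_conll
from seqlearn.perceptron import StructuredPerceptron


def features(sequence, i):
    # Hypothetical minimal feature extractor in the shape load_conll expects:
    # takes the token sequence and an index, yields feature strings.
    yield 'word=' + sequence[i].lower()
    if i > 0:
        yield 'prev=' + sequence[i - 1].lower()


# Train on a CoNLL file produced by write_anotations_to_file.
X_train, y_train, lengths_train = load_conll('train_ann', features)
clf = StructuredPerceptron()
clf.fit(X_train, y_train, lengths_train)

print(predict_cars(clf, u'Yesterday I saw a Toyota Corolla downtown'))
# -> [u'Toyota Corolla'] once the model has learned the B/I tags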