Example #1
0
def train_xgboost_lr(data_path,
                     vectorizer_path=None,
                     xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None,
                     feature_encoder_path=None,
                     feature_type='tfidf_char',
                     col_sep='\t'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path,
                  feature_encoder_path)
    # fit
    model.train_model(X_train, y_train)
    # evaluate
    label_pred = model.predict(X_val)
    simple_evaluate(y_val, label_pred)
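A hypothetical invocation of the trainer above; the file paths below are placeholders, not taken from the source.

train_xgboost_lr('data/train.txt',
                 vectorizer_path='output/vectorizer.pkl',
                 xgblr_xgb_model_path='output/xgblr_xgb.model',
                 xgblr_lr_model_path='output/xgblr_lr.model',
                 feature_encoder_path='output/xgblr_encoder.pkl',
                 feature_type='tfidf_char',
                 col_sep='\t')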
Example #2
0
def infer_classic(model_save_path,
                  test_data_path,
                  thresholds=0.5,
                  pred_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  num_classes=2,
                  feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()

    if num_classes == 2:
        # binary classification
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
Example #3
0
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)

    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error. no true labels")

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict,
                                                id_label)
        print(pred_labels[:5])
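A small sketch (made-up labels and probabilities) of the vocab inversion and argmax mapping used above.

import numpy as np

label_id = {'sports': 0, 'tech': 1, 'finance': 2}   # hypothetical vocab
id_label = {v: k for k, v in label_id.items()}      # {0: 'sports', 1: 'tech', 2: 'finance'}
pred_label_probs = np.array([[0.1, 0.7, 0.2],
                             [0.6, 0.3, 0.1]])
pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
print(pred_labels)                                  # ['tech', 'sports']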
Example #4
0
def __find_candidates_to_anaphora_in_chapter(chapter, total_words, candidates,
                                             stop_word_check):
    """ Finds candidates to anaphora in the specified chapter"""
    word_count = 0
    i = 0
    while i < len(chapter.sentences):
        if len(chapter.sentences[i]) > 1:
            candidate = Feature('anaphora')
            candidate.add_word(total_words + word_count)
            candidate.add_context(
                total_words + word_count,
                total_words + word_count + len(chapter.sentences[i]) - 1)
            word_count += len(chapter.sentences[i])
            first_anaphora_word = chapter.sentences[i][0].lower()
            i += 1
            if stop_word_check(first_anaphora_word):
                continue
            context_length = 1
            while (i < len(chapter.sentences) and
                   chapter.sentences[i][0].lower() == first_anaphora_word):
                candidate.extend_context(total_words + word_count +
                                         len(chapter.sentences[i]) - 1)
                candidate.add_word(total_words + word_count)
                word_count += len(chapter.sentences[i])
                i += 1
                context_length += 1
            if context_length > 1:
                if candidate not in candidates:
                    candidates.append(candidate)
        else:
            if len(chapter.sentences[i]) > 0 and chapter.sentences[i][0]:
                word_count += len(chapter.sentences[i])
            i += 1
    return word_count
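A toy illustration (invented sentences) of the core test above: consecutive sentences opening with the same non-stop word are grouped into one anaphora candidate.

sentences = [["winter", "is", "coming"],
             ["winter", "ends", "slowly"],
             ["spring", "follows"]]
first_word = sentences[0][0].lower()
run_length = 1
while run_length < len(sentences) and sentences[run_length][0].lower() == first_word:
    run_length += 1
print(run_length > 1)   # True: the first two sentences share the opening word "winter"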
Example #5
0
def infer_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     label_vocab_path='',
                     max_len=300,
                     batch_size=128,
                     col_sep='\t',
                     pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the HAN model needs a [doc, sentence, dim] feature (3-d); other models use a [sentence, dim] feature (2-d)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set,
                      feature_type=feature_type,
                      is_infer=True,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    # load model
    model = load_model(model_save_path)
    # predict; in Keras, predict_proba is the same as predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))

        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
Example #6
0
def _get_feature(self, word_vocab):
    # extract features
    print("feature_type : %s" % self.feature_type)
    print("seg_contents:")
    print(self.seg_contents[:2])
    feature = Feature(data=self.seg_contents,
                      feature_type=self.feature_type,
                      feature_vec_path=self.feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    return feature.get_feature()
Example #7
0
    def test_returns_none_if_database_and_enum_are_consistent(self, app):
        # Given
        find_all_features = MagicMock()
        features = []
        for feature_toggle in FeatureToggle:
            feature = Feature()
            feature.populate_from_dict({'name': feature_toggle})
            features.append(feature)
        find_all_features.return_value = features

        # When / Then
        assert check_feature_consistency(find_all_features) is None
Example #8
0
def install_features():
    Feature.query.delete()
    features = []
    for toggle in FeatureToggle:
        feature = Feature()
        feature.populate_from_dict({
            'description': toggle.value,
            'name': toggle,
            'is_active': True
        })
        features.append(feature)
    PcObject.save(*features)
def context_intersection(first: Feature, second: Feature) -> list:
    """
    Returns context intersection between two features

    :param first: first feature
    :param second: second feature
    :return: context intersection
    """
    first_beg, first_end = first.context_begin(), first.context_end()
    second_beg, second_end = second.context_begin(), second.context_end()
    if first_beg > second_end or second_beg > first_end:
        return []
    return [max(first_beg, second_beg), min(first_end, second_end)]
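A minimal usage sketch, assuming the function above is in scope; _StubFeature is a hypothetical stand-in exposing only context_begin/context_end, not the real Feature class.

class _StubFeature:
    def __init__(self, begin, end):
        self._begin, self._end = begin, end

    def context_begin(self):
        return self._begin

    def context_end(self):
        return self._end

print(context_intersection(_StubFeature(2, 10), _StubFeature(7, 15)))   # [7, 10]
print(context_intersection(_StubFeature(0, 3), _StubFeature(5, 9)))     # []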
Example #10
0
def find(anaphoras: list, epiphoras: list) -> list:
    """
    Finds symploce between previously found anaphoras and epiphoras

    :param anaphoras: list with anaphoras
    :param epiphoras: list with epiphoras
    :return: list with symploces (Feature objects)
    """
    res = list()
    for anaphora in anaphoras:
        for epiphora in epiphoras:
            intersection = context_intersection(anaphora, epiphora)
            if intersection:
                inter_start, inter_end = intersection
                anaphora_words = [
                    word for word in anaphora.words()
                    if inter_start <= word <= inter_end
                ]
                epiphora_words = [
                    word for word in epiphora.words()
                    if inter_start <= word <= inter_end
                ]
                if anaphora_words and epiphora_words:
                    if anaphora_words[-1] > epiphora_words[0]:
                        res.append(
                            Feature("symploce",
                                    words=sorted(anaphora_words +
                                                 epiphora_words),
                                    context=intersection))
    return res
def __parse_conjunctive_adverbs_polysyndeton(sent: list, start_count: int,
                                             language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]
    for conj_adv in conjunctive_adverbs(language=language):
        candidates_start = []  # start positions of repeating conjunctive adverb candidates
        for i in range(len(sentence) - len(conj_adv) + 1):
            if tuple(sentence[i:i + len(conj_adv)]) == conj_adv:
                candidates_start.append(i)
        if len(candidates_start) >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[
                            start_count + c_pos + con_len
                            for c_pos in candidates_start
                            for con_len in range(len(conj_adv))
                        ],
                        context=[start_count,
                                 start_count + len(sentence) - 1]))
    return res
def __parse_pair_conjunctions_polysyndeton(sent: list, start_count: int,
                                           language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]  # <list> of words
    for conj_word_1, conj_word_2 in pair_conjunctions(language=language):
        positions = set()  # polysyndeton words positions
        first_word_met, first_word_pos = False, -1
        for i, word in enumerate(sentence):
            if word == conj_word_1:
                first_word_met, first_word_pos = True, i
            elif word == conj_word_2 and first_word_met:
                positions.update({first_word_pos, i})
                first_word_met, first_word_pos = False, -1
        if len(positions) // 2 >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[start_count + i for i in positions],
                        context=[start_count,
                                 start_count + len(sentence) - 1]))
    return res
Example #13
0
def __find_anadiplosis_inside_chapter(chapter: list, start_count: int,
                                      stop_word_check) -> list:
    """
    :param chapter: chapter as list of sentences (as list of words)
    :param start_count: index of first word in chapter
    :return: list of Features
    """
    threshold = 1  # minimal metric
    res = []
    word_count = start_count
    for i in range(len(chapter) - 1):
        candidate = __test_sentences_for_anadiplosis(chapter[i],
                                                     chapter[i + 1],
                                                     stop_word_check)
        if candidate['metric'] >= threshold:
            res.append(
                Feature(feature_type="anadiplosis",
                        words=[
                            n + word_count + len(chapter[i])
                            for n in range(-len(candidate['words']),
                                           len(candidate['words']))
                        ],
                        context=[
                            word_count, word_count + len(chapter[i]) +
                            len(chapter[i + 1]) - 1
                        ]))
        word_count += len(chapter[i])
    return res
Example #14
0
def get_features(auth: AuthData):
    """
    This handler function returns features by service, or all features.
    """

    if auth.user:
        return Feature.all().serialize()

    features = _set_visible_only(["id", "version"], auth.service.features)
    return features.serialize()
Example #16
0
def _find_feature(id):
    """
    Find a feature by id
    """

    feature = Feature.find(id)

    if not feature:
        msg = f"Feature not found."
        raise exceptions.NotFound({"message": msg})

    return feature
def __find_between_sentences(self, first_sentence, second_sentence):
    if (first_sentence[0] == second_sentence[-1]
            and first_sentence[-1] == second_sentence[0]):
        words = [
            self.word_counter,
            self.word_counter + len(first_sentence) - 1,
            self.word_counter + len(first_sentence),
            self.word_counter + len(first_sentence) + len(second_sentence) - 1,
        ]
        context = [
            self.word_counter,
            self.word_counter + len(first_sentence) + len(second_sentence) - 1,
        ]
        self._features.append(Feature('chiasmus', words, context))
Example #19
0
def create_feature(featureData: FeatureCreate):
    """
    Create a new feature
    """

    _check_id_format(featureData.id)
    _check_version_format(featureData.version)

    if Feature.find(featureData.id):
        msg = f"The feature id isn't available. Please try another."
        return http.JSONResponse({"message": msg}, status_code=409)

    _check_services_exists(featureData.services)

    feature = Feature.create(**featureData)
    msg = "Feature created successfully."
    log.info(f"{msg} - ID: {feature.id}")

    feature.update_services(featureData.services)

    headers = {"Content-Location": f"/features/{feature.id}"}
    return http.JSONResponse({"message": msg}, status_code=201, headers=headers)
def load(self):
    """ Loads Document from specified file """
    with open(self.file_name, "r", encoding='utf8') as file:
        json_doc = json.loads(file.read())
    self.language = json_doc["metadata"]["language"]
    features = [
        Feature(feature['type'], feature['words'], feature['context'],
                self.__letters_to_int(feature['letters']),
                feature['transcription'])
        for feature in json_doc["features"]
    ]
    chapters = self.__load_chapters(json_doc["text"])
    stop_words = json_doc["stop_words"]
    return Document(chapters, self.language, features, stop_words)
Example #21
0
def create_feature(featureData: FeatureCreate):
    """
    Create a new feature
    """

    _check_id_format(featureData.id)
    _check_version_format(featureData.version)

    if Feature.find(featureData.id):
        msg = f"The feature id isn't available. Please try another."
        return http.JSONResponse({"message": msg}, status_code=409)

    _check_services_exists(featureData.services)

    feature = Feature.create(**featureData)
    msg = "Feature created successfully."
    log.info(f"{msg} - ID: {feature.id}")

    feature.update_services(featureData.services)

    headers = {"Content-Location": f"/features/{feature.id}"}
    return http.JSONResponse({"message": msg},
                             status_code=201,
                             headers=headers)
Example #22
0
def infer_xgboost_lr(test_data_path,
                     vectorizer_path=None,
                     xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None,
                     feature_encoder_path=None,
                     col_sep='\t',
                     pred_save_path=None,
                     feature_type='tfidf_char'):
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path,
                  feature_encoder_path)
    # predict
    label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
Example #23
0
def train_classic(model_type,
                  data_path=None,
                  pr_figure_path=None,
                  model_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  thresholds=0.5,
                  num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model,
         X_val,
         y_val,
         thresholds=thresholds,
         num_classes=num_classes,
         model_type=model_type,
         pr_figure_path=pr_figure_path)
def __parse_simple_conjunctions_polysyndeton(sent: list, start_count: int,
                                             language: str) -> list:
    """
    :param sent: sentence as list of words
    :param start_count: position of first word in sentence in document
    :return: list of Features
    """
    res = []
    sentence = [w.lower() for w in sent]
    for conj in conjunctions(language=language):
        if sentence.count(conj) >= MIN_N_OF_REPETITIONS:
            res.append(
                Feature(feature_type="polysyndeton",
                        words=[
                            start_count + i for i, w in enumerate(sentence)
                            if w == conj
                        ],
                        context=[start_count,
                                 start_count + len(sentence) - 1]))
    return res
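A quick walk-through (invented tokens, MIN_N_OF_REPETITIONS assumed to be 2) of the counting rule above: a conjunction occurring at least that many times in one sentence becomes a polysyndeton candidate whose word indices are the conjunction positions.

MIN_N_OF_REPETITIONS = 2    # assumed value, for illustration only
sentence = "he ran and jumped and laughed and sang".split()
conj = "and"
positions = [i for i, w in enumerate(sentence) if w == conj]
print(positions)                                       # [2, 5, 8]
print(sentence.count(conj) >= MIN_N_OF_REPETITIONS)    # True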
def __parse_diacope_inside_sentence(sent: list, start_count: int, excluded_words: set) -> list:
    """
    Parses diacope from the specified sentence

    :param sent: sentence to find diacope from (as list of words)
    :param start_count: index of first word in sentence
    :param excluded_words: set of words which can't be part of diacope
    :return: list with diacope in sentence
    """
    min_diacope_power = 2
    res = []
    for word in set(sent) - excluded_words:
        word_positions = [i for i, w in enumerate(sent) if w == word]
        if len(word_positions) >= min_diacope_power:
            diacope_positions = __diacope_words(word_positions)
            if len(diacope_positions) >= min_diacope_power:
                res.append(Feature("diacope", [start_count + pos for pos in diacope_positions],
                                   [start_count, start_count + len(sent) - 1]))
    res = __merge_diacope_in_nearby_words_inside_sentence(res)
    return res
Example #26
0
def parse_features_collection_json():
    # path_to_file = '/Users/akratovich/projects/python/python_stud/parser/real_json_sample.json'
    path_to_file = '/Users/akratovich/projects/python/sf-city-lots-json/citylots.json'
    parsed_features = []

    features_collection = Reader.read_json(path_to_file)

    for feature_item in features_collection['features']:
        tmp_coord_list = __get_points(feature_item)
        tmp_figure_type = __get__geometry_type(feature_item)

        tmp_geometry = Geometry(tmp_figure_type, tmp_coord_list)

        tmp_feature_type = __get_feature_type(feature_item)
        tmp_prop_list = __get_prop_feature_prop_list(feature_item)
        feature = Feature(tmp_feature_type, tmp_prop_list, tmp_geometry)
        # print(feature.__str__())
        parsed_features.append(feature)
        print('{0} features are parsed'.format(len(parsed_features)))
    print('Parsed {0} features'.format(len(parsed_features)))
    return parsed_features
def __find_epizeuxis_inside_sentence(sentence: Sentence, start_count: int,
                                     stop_word_check) -> list:
    """Finds epizeuxis in the given sentence

    :param sentence: sentence to find epizeuxis in
    :param start_count: index of first word in sentence
    :param stop_word_check: function checking if word is stop word
    :return: epizeuxis into given sentence (as list of Features)"""
    res = []
    i = 0
    sentence = [word.lower() for word in sentence.words_list]
    while i < len(sentence) - 1:
        repeat_length, n_of_repeats = 0, 0
        for length in range(1, len(sentence) // 2 + 1):
            if sentence[i:i + length] == sentence[i + length:i + length * 2] and \
                    True not in [stop_word_check(word) for word in sentence[i:i + length]]:
                repeat_length, n_of_repeats = length, 2
                break
        if repeat_length:
            for repeats in range(
                    3,
                    len(sentence[i + repeat_length * 2:]) // repeat_length):
                if sentence[i:i + repeat_length] != \
                        sentence[i + repeat_length * (repeats - 1):i + repeat_length * repeats]:
                    break
                n_of_repeats += 1
            res.append(
                Feature("epizeuxis",
                        words=[
                            start_count + i + j
                            for j in range(repeat_length * n_of_repeats)
                        ],
                        context=[start_count,
                                 start_count + len(sentence) - 1]))
            i += repeat_length * n_of_repeats
        else:
            i += 1
    return res
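A toy check (invented tokens) of the block-repetition test above: a block of length two repeated back to back is exactly what the inner loop looks for.

sentence = ["never", "ever", "never", "ever", "give", "up"]
i, length = 0, 2
print(sentence[i:i + length] == sentence[i + length:i + length * 2])   # True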
def __find_epizeuxis_between_sentences(chapter: Chapter, start_count: int,
                                       stop_word_check) -> list:
    """Finds epizeuxis between sentences in the given chapter

    :param chapter: chapter to find epizeuxis in
    :param start_count: index of first word in chapter
    :param stop_word_check: function checking if word is stop word
    :return: epizeuxis in given chapter (as list of Features)"""
    res = []
    current_feature = None
    chapter = [[word.lower() for word in sentence.words_list]
               for sentence in chapter.sentences]
    for i in range(len(chapter) - 1):
        if chapter[i] == chapter[i + 1] and True not in [
                stop_word_check(word) for word in chapter[i + 1]
        ]:
            if current_feature:
                current_feature.extend_context(start_count +
                                               len(chapter[i + 1]))
                for j in range(len(chapter[i + 1])):
                    current_feature.add_word(start_count + len(chapter[i]) + j)
            else:
                current_feature = Feature(
                    "epizeuxis",
                    words=[
                        start_count + j
                        for j in range(len(chapter[i]) + len(chapter[i + 1]))
                    ],
                    context=[
                        start_count,
                        start_count + len(chapter[i]) + len(chapter[i + 1])
                    ])
                res.append(current_feature)
        else:
            current_feature = None
        start_count += len(chapter[i])
    return res
Example #29
0
def __find_epiphora_inside_chapter(chapter: Chapter, start_count: int,
                                   stop_word_check) -> list:
    """
        Parses epiphora from chapter

    :param chapter: chapter to find epiphora from (list of sentences as list of words)
    :param start_count: index of first word in chapter
    :param stop_word_check: function checking if word is stop word
    :return: list with epiphora(Feature objects)
    """
    res = []
    word_count = start_count
    current_feature = None
    for i in range(len(chapter.sentences) - 1):
        if __test_sentences_for_epiphora(chapter.sentences[i],
                                         chapter.sentences[i + 1],
                                         stop_word_check):
            if current_feature is None:
                current_feature = Feature(
                    "epiphora",
                    words=[
                        word_count + len(chapter.sentences[i]) - 1,
                        word_count + len(chapter.sentences[i]) +
                        len(chapter.sentences[i + 1]) - 1
                    ],
                    context=[
                        word_count, word_count + len(chapter.sentences[i]) +
                        len(chapter.sentences[i + 1]) - 1
                    ])
                res.append(current_feature)
            else:
                current_feature.add_word(word_count +
                                         len(chapter.sentences[i]) +
                                         len(chapter.sentences[i + 1]) - 1)
                current_feature.extend_context(word_count +
                                               len(chapter.sentences[i]) +
                                               len(chapter.sentences[i + 1]) - 1)
        else:
            current_feature = None
        word_count += len(chapter.sentences[i])
    return res
Example #30
0
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error; using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
Example #31
0
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=512,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))

    # init feature
    # the HAN model needs a [doc, sentence, dim] feature (3-d); other models use a [sentence, dim] feature (2-d)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
Example #32
0
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt


if __name__ == "__main__":
    sys.path.append("../")
    from models.feature import Feature
    from models.reader import data_reader

    data_content, data_lbl = data_reader('../data/train_words.txt', '\t')
    # init feature
    feature = Feature(feature_type='tfidf_word',
                      feature_vec_path='../output/temp')
    # get data feature
    data_feature = feature.get_feature(data_content)
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.2)
    search_cv(X_train, y_train, X_val, y_val, model=SVC())

    # test plot_learning_curve
    title = "Learning Curves (Random Forest, n_estimators = 30)"
    estimator = SVC()
    plot_learning_curve(estimator,
                        title,
                        X_train,
Example #33
0
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output,
         ture_labels=None,
         pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))