def xgboost_model(data, amlclf=None):
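    # Train an XGBoost classifier that decides, for each candidate name in a
    # document, whether it belongs to that row's gold name set `acc`.
    # Features are per-name description vectors built from BM25 (default) or
    # word2vec (when the module-level `byW2v` flag is set).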

    model = {'data': data}

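    # `byW2v` and `config` are assumed to be module-level globals defined
    # elsewhere in this file.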
    if byW2v:
        model['w2v'] = config['load_w2v']()
    else:
        # Collect the content (second field) of every training document.
        token = [cont for _, cont in
                 (data['corpus'][idx] for idx, *_ in data['TrainData'])]

        # We could also append the title to each document, but we skip that here.
        # To retain the sentence structure we keep stopwords in `token`; the
        # commented-out code below would strip them at this point instead:
        #stopwordTxt = os.path.join('..', 'data', 'stopword', "stopword.txt")
        #data['stop'] = set(open(stopwordTxt, "r").read().split())
        #token = [' '.join([t for t in doc.split() if t not in data['stop']]) for doc in token]

        model['bm25'] = BM25Transformer()
        model['vectorizer'] = TfidfVectorizer()

        model['vectorizer'].fit(token)
        TrainTf = model['vectorizer'].transform(tqdm(token))
        print("fitting bm25...", end='')
        sys.stdout.flush()
        model['bm25'].fit(TrainTf)
        print("transforming...", end='')
        model['TrainBm25'] = model['bm25'].transform(TrainTf)
        print("ok")

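        # If a previously trained AML classifier is supplied, reuse its fitted
        # vectorizer and BM25 transformer instead of the ones fitted above.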
        if amlclf:
            vectorizer = amlclf['vectorizer']
            bm25_model = amlclf['bm25']
        else:
            vectorizer = model['vectorizer']
            bm25_model = model['bm25']

    def fetch_sentence_by_winsize(cont, idx, size):
        # Tokens within `size` positions on either side of idx, excluding idx itself.
        beg = lambda index, offset: max(0, index - offset)
        end = lambda index, offset: min(index + offset, len(cont))
        return cont[beg(idx, size):idx] + cont[idx + 1:end(idx, size) + 1]

    def fetch_sentence_by_punctuation(cont, idx):
        # Expand from idx outwards to the surrounding '。' sentence delimiters.
        beg, end = idx, idx
        while cont[beg] != '。' and beg > 0:
            beg -= 1
        while cont[end] != '。' and end < len(cont) - 1:
            end += 1
        #print(''.join(cont[beg + 1:end]))
        #beg = max(idx - 7, beg)
        #end = min(idx + 7, end)
        return cont[beg + 1:end]

    # `namefilter`, `isReporter`, and `get_indexes` are helpers assumed to be
    # defined elsewhere in this module.
    model['namefilter'] = namefilter()
    model['reporter'] = set()

    def names_to_bm25(names, cont):

        names = model['namefilter'](names, cont)
        # Stopwords could be removed here as well:
        cont = cont.split()
        #cont = [t for t in cont if t not in data['stop']]

        people, description = [], []

        reporters = {name for name in names if isReporter(name, cont)}
        model['reporter'] |= reporters

        names = [name for name in names if name not in reporters]

        descs_of_name = lambda name, cont: \
                [fetch_sentence_by_winsize(cont, idx, 5) for idx in get_indexes(cont, name)]
        #[fetch_sentence_by_punctuation(cont, idx) for idx in get_indexes(cont, name)]

        #descs = {name: descs_of_name(name, cont) for name in names}
        name_descs = [(name, descs_of_name(name, cont)) for name in names]
        descs = {nm: dscs for nm, dscs in name_descs if dscs}

        if len(descs) == 0:
            # No usable description for any name: dump debug info and bail out.
            print(names)
            print(name_descs)
            print(cont)
            return [], []

        if amlclf:
            # Re-tokenize each description with the AML classifier's tokenizer.
            descs = {name: [amlclf['data']['config']['tokenize'](''.join(desc)).split()
                            for desc in descs[name]]
                     for name in descs}

        if byW2v:
            base = np.zeros((model['w2v'].vector_size, ))
        else:
            base = np.zeros((len(vectorizer.idf_), ))

        if byW2v:
            # For each name: sum the word vectors of every description window,
            # then average over the windows. (`descs` is non-empty here because
            # of the early return above.)
            people, description = map(list, zip(*[
                (name,
                 np.sum([np.ravel(np.sum(
                             model['w2v'][[t for t in desc if t in model['w2v']]],
                             axis=0))
                         for desc in descs[name]],
                        axis=0) / len(descs[name]))
                for name in descs]))

        else:  # BM25
            # For each name: BM25-transform each joined description window, sum
            # the rows, and average; `base` keeps the result a full-width vector.
            people, description = map(list, zip(*[
                (name,
                 np.sum([np.ravel(bm25_model.transform(
                             vectorizer.transform([' '.join(desc)])).sum(axis=0))
                         for desc in descs[name]] + [base],
                        axis=0) / max(1, len(descs[name])))
                for name in descs]))

        #pprint(list(zip(people, [np.sum(d) for d in description])))
        return people, description

    # One training row per (document, candidate name): the description vector is
    # the feature, `name in acc` the label, and the name itself is kept for later.
    xtrain_tfv, ytrain, model['ntrain'] = map(
        list,
        zip(*[(desc, name in acc, name) for idx, acc, pred in data['TrainData']
              for _, cont in [data['corpus'][idx]]
              for name, desc in zip(*names_to_bm25(pred, cont))]))

    # Reduce dimensionality before XGBoost.
    if byW2v:
        svd = decomposition.TruncatedSVD()
    else:
        svd = decomposition.TruncatedSVD(n_components=200)

    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)

    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    # Fitted but unused below: the classifier trains on the unscaled SVD features.
    xtrain_svd_scl = scl.transform(xtrain_svd)

    clf = xgb.XGBClassifier(max_depth=7,
                            n_estimators=200,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            nthread=10,
                            learning_rate=0.1)
    clf.fit(xtrain_svd, ytrain)

    #def documents_to_bm25(tokens):
    #    tf = model['vectorizer'].transform(tqdm(tokens))
    #    print("doing the valid set transformation...", end='')
    #    sys.stdout.flush()
    #    DocData = model['bm25'].transform(tf)
    #    print("ok")
    #    print('tf.shape:', tf.shape)
    #    return DocData

    def validate(xvalid_tfv, show_loss=False):
        if not xvalid_tfv:
            return []
        xvalid_svd = svd.transform(xvalid_tfv)
        xvalid_svd_scl = scl.transform(xvalid_svd)  # unused; mirrors training

        if show_loss:
            # NOTE: assumes a `yvalid` label array exists in an enclosing scope.
            predictions = clf.predict_proba(xvalid_svd)
            print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

        predictions = clf.predict(xvalid_svd)

        return predictions

    def predict(pred, cont, show_loss=False):
        name, desc = names_to_bm25(pred, cont)
        return name, validate(desc, show_loss)

    model['names_to_bm25'] = names_to_bm25
    model['validate'] = validate
    model['predict'] = predict

    return model
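
# Hypothetical usage sketch (assumptions, not from the source): `data['corpus']`
# maps an index to a (title, content) pair with space-separated tokens, and each
# `data['TrainData']` row is (corpus index, gold name set, candidate names).
#
#   model = xgboost_model(data)
#   names, labels = model['predict'](candidate_names, content)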
Example No. 2
    title_weight = 2

    for i, key in enumerate(tqdm(tokey)):
        title = retain_chinese(titles.get(key, '')).strip()
        if title and title != "Non":
            title_token = ' {}'.format(' '.join([
                w for w in cut_method(title) if w not in stopwords
            ])) * title_weight
            token[i] += title_token
            #print('+= ' + title_token)

    if len(token) != len(tokey):
        print('len(token) should equal len(tokey)')
        exit(0)

    bm25 = BM25Transformer()
    vectorizer = TfidfVectorizer()
    print("""
    building corpus vector space...
        """)

    doc_tf = vectorizer.fit_transform(tqdm(token))

    bm25.fit(doc_tf)
    doc_bm25 = bm25.transform(doc_tf)

    print('\ncorpus vector space - ok\n')

    docsTokens = [t.split() for t in token]

    print("loading model")
Example No. 3
def new_models(config):

    models = {}

    token = mapTrim(
        open(config['tokenFile'], encoding="UTF-8").read().split('\n'))
    title = mapTrim(
        open(config['titleFile'], encoding="UTF-8").read().split('\n'))

    if len(config['tokey']) != len(token) or len(token) != len(title):
        print('len(tokey) {}, len(token) {}, len(title) {} should all be equal'.format(
            len(config['tokey']), len(token), len(title)))
        exit(0)

    # append title to doc
    print("\nappending title to document...\n")

    title_weight = 2  # assumed weight; matches the value used in Example No. 2
    for i, key in enumerate(tqdm(config['tokey'])):
        if title[i] and title[i] != "Non":
            token[i] += ' {}'.format(title[i]) * title_weight

    print("\nbuilding corpus vector space...\n")

    models['bm25'] = BM25Transformer()
    models['vectorizer'] = TfidfVectorizer()
    doc_tf = models['vectorizer'].fit_transform(tqdm(token))

    print("fitting bm25...", end='')
    sys.stdout.flush()
    models['bm25'].fit(doc_tf)
    print("transforming...", end='')
    models['doc_bm25'] = models['bm25'].transform(doc_tf)
    print("ok")

    print("saving bm25Cache...", end='')
    sys.stdout.flush()
    joblib.dump(models['bm25'], config['bm25Cache'])
    print("ok")
    print("saving docBM25Cache...", end='')
    sys.stdout.flush()
    joblib.dump(models['doc_bm25'], config['docBM25Cache'])
    print("ok")
    print("saving vectorizerCache...", end='')
    sys.stdout.flush()
    joblib.dump(models['vectorizer'], config['vectorizerCache'])
    print("ok")

    print('\ncorpus vector space - ok\n')

    docsTokens = [t.split() for t in token]

    # word2vec document vectors
    print("loading w2v model...", end='')
    sys.stdout.flush()
    models['w2v'] = config['load_w2v']()
    print("ok")

    print("making document word vector")

    models['docWv'] = np.array(
        [np.sum(models['w2v'][[t for t in tokens if t in models['w2v']]], axis=0)
         for tokens in tqdm(docsTokens)])

    print("saving docW2VCache...", end='')
    sys.stdout.flush()
    joblib.dump(models['docWv'], config['docW2VCache'])
    print("ok")
    return models
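
# Hypothetical usage sketch (assumptions, not from the source): the `config`
# keys and example paths below are exactly the ones new_models() reads.
#
#   config = {
#       'tokey': keys,                      # one key per document
#       'tokenFile': 'data/token.txt',      # one tokenized document per line
#       'titleFile': 'data/title.txt',      # one title per line
#       'bm25Cache': 'cache/bm25.pkl',
#       'docBM25Cache': 'cache/doc_bm25.pkl',
#       'vectorizerCache': 'cache/vectorizer.pkl',
#       'docW2VCache': 'cache/doc_w2v.pkl',
#       'load_w2v': lambda: KeyedVectors.load('cache/w2v.kv'),
#   }
#   models = new_models(config)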
Example No. 4
def train(xtrain, ytrain, xval, yval, lang, tags_to_idx, weighting):
    if weighting == 'tfidf':
        path = "./models/model_" + lang + "_weights.hdf5"
    elif weighting == 'bm25':
        path = "./models/model_" + lang + "_bm25_weights.hdf5"
    else:
        raise ValueError("weighting must be 'tfidf' or 'bm25'")
    checkpointer = ModelCheckpoint(filepath=path,
                                   verbose=1,
                                   monitor="val_acc",
                                   save_best_only=True,
                                   mode="max")

    #print("Train and dev shape: ", xtrain.shape, xval.shape)
    counts = defaultdict(int)
    for c in ytrain.tolist():
        counts[c] += 1

    if lang != 'all':
        character_vectorizer = CountVectorizer(analyzer='char',
                                               ngram_range=(3, 6),
                                               lowercase=False,
                                               min_df=5,
                                               max_df=0.3)
    else:
        character_vectorizer = CountVectorizer(analyzer='char_wb',
                                               ngram_range=(3, 5),
                                               lowercase=False,
                                               min_df=5,
                                               max_df=0.3)

    if weighting == 'tfidf':
        transformer = TfidfTransformer(sublinear_tf=True)
    elif weighting == 'bm25':
        transformer = BM25Transformer()

    # Despite its name, `tfidf_matrix` is the feature pipeline; it is fitted
    # below, and `tfidf_matrix_test` is the transformed *training* matrix.
    tfidf_matrix = pipeline.Pipeline([
        ('character',
         pipeline.Pipeline([('s5', text_col(key='text_clean')),
                            ('character_vectorizer', character_vectorizer),
                            ('tfidf_character', transformer)])),
        ('scale', Normalizer())
    ])

    tfidf_matrix = tfidf_matrix.fit(xtrain)
    tfidf_matrix_test = tfidf_matrix.transform(xtrain)
    print('tfidf matrix size: ', tfidf_matrix_test.shape)
    ngrams_matrix_shape = tfidf_matrix_test.shape[1]
    tfidf_matrix_val = tfidf_matrix.transform(xval)

    charvec, char_vocab, max_train_len_char = make_charvec(
        xtrain.text_clean.tolist())
    char_vocab_size = len(char_vocab) + 2
    charvec_shape = charvec.shape[1]
    charvec_val, _, _ = make_charvec(xval.text_clean.tolist(),
                                     train=False,
                                     char_vocab=char_vocab,
                                     max_text_len=max_train_len_char)

    num_classes = len(set(yval.tolist()))

    textmodel_data = (ngrams_matrix_shape, num_classes, charvec_shape,
                      char_vocab_size, tfidf_matrix, char_vocab,
                      max_train_len_char, tags_to_idx)

    if weighting == 'tfidf':
        data_path = 'models/model_' + lang + '_data.pk'
    elif weighting == 'bm25':
        data_path = 'models/model_' + lang + '_bm25_data.pk'
    with open(data_path, 'wb') as f:
        pickle.dump(textmodel_data, f, protocol=2)

    if lang != 'all':
        if lang not in ['sg', 'ar']:
            num_epoch = 20
        else:
            num_epoch = 80
    else:
        num_epoch = 10

    model = build_model(ngrams_matrix_shape, num_classes, charvec_shape,
                        char_vocab_size)
    model.fit([tfidf_matrix_test, charvec],
              ytrain,
              validation_data=([tfidf_matrix_val, charvec_val], yval),
              batch_size=16,
              epochs=num_epoch,
              verbose=0,
              callbacks=[checkpointer])

    K.clear_session()
    gc.collect()

    return model
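
# Hypothetical usage sketch (assumptions, not from the source): xtrain/xval are
# DataFrames with a 'text_clean' column and ytrain/yval are integer label arrays.
#
#   model = train(xtrain, ytrain, xval, yval,
#                 lang='en', tags_to_idx=tags_to_idx, weighting='bm25')
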
def xgboost_model(data):

    model = {'data': data}

    print("\nbuilding corpus vector space...\n")

    model['bm25'] = BM25Transformer()
    model['vectorizer'] = TfidfVectorizer()
    model['vectorizer'].fit(data['TrainData'])
    #data['vectorizer'].fit(ValidToken)

    TrainTf = model['vectorizer'].transform(tqdm(data['TrainData']))

    print("fitting bm25...", end='')
    sys.stdout.flush()
    model['bm25'].fit(TrainTf)
    #data['bm25'].fit(ValidTf)
    print("ok")

    print("transforming...", end='')
    sys.stdout.flush()
    data['TrainData'] = model['bm25'].transform(TrainTf)
    print("ok")
    print('TrainTf.shape:', TrainTf.shape)

    ytrain = data['TrainLabel']
    xtrain_tfv = data['TrainData']

    svd = decomposition.TruncatedSVD(n_components=120)

    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)

    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    # Fitted but unused below: the classifier trains on the unscaled SVD features.
    xtrain_svd_scl = scl.transform(xtrain_svd)

    clf = xgb.XGBClassifier(max_depth=7,
                            n_estimators=200,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            nthread=10,
                            learning_rate=0.1)
    clf.fit(xtrain_svd, ytrain)

    def documents_to_bm25(tokens):
        tf = model['vectorizer'].transform(tqdm(tokens))
        print("doing the valid set transformation...", end='')
        sys.stdout.flush()
        DocData = model['bm25'].transform(tf)
        print("ok")
        print('tf.shape:', tf.shape)
        return DocData

    def validate(documents, show_loss=False):
        xvalid_tfv = documents_to_bm25(documents)
        xvalid_svd = svd.transform(xvalid_tfv)
        xvalid_svd_scl = scl.transform(xvalid_svd)  # unused; mirrors training

        if show_loss:
            # NOTE: assumes a `yvalid` label array exists in an enclosing scope.
            predictions = clf.predict_proba(xvalid_svd)
            print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

        predictions = clf.predict(xvalid_svd)
        return predictions

    def predict(doc, show_loss=False):
        return validate([data['config']['tokenize'](doc)], show_loss)[0]

    model['validate'] = validate
    model['predict'] = predict

    return model
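
# Hypothetical usage sketch (assumptions, not from the source): `data` carries
# pre-tokenized training documents, their labels, and the tokenizer used for
# new documents; `my_tokenizer` is a placeholder name.
#
#   data = {
#       'TrainData': ['doc one tokens ...', 'doc two tokens ...'],
#       'TrainLabel': [0, 1],
#       'config': {'tokenize': my_tokenizer},  # str -> space-joined tokens
#   }
#   model = xgboost_model(data)
#   label = model['predict']('raw document text')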