Example #1
import pickle

import numpy as np


def rf_predictor(incident):
    # Load the persisted random-forest model, count vectorizer and tf-idf transformer.
    rf_model = pickle.load(open("Models/rf/randomforest_model.sav", "rb"))
    cv = pickle.load(open("Models/rf/randomforest_vector.pickel", "rb"))
    tf = pickle.load(open("Models/rf/randomforest_transformer.pickel", "rb"))
    # Vectorize the preprocessed incident text, then apply the tf-idf weighting.
    vect_rf = cv.transform(rf_preprocess(incident))
    trans_rf = tf.transform(vect_rf)
    # Return the predicted label and the top class probability as a percentage string.
    rf_pred = rf_model.predict(trans_rf)[0]
    prob = rf_model.predict_proba(trans_rf)
    rf_prob = str(np.round(np.max(prob) * 100, 2)) + "%"
    return rf_pred, rf_prob
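The example relies on an rf_preprocess helper that is not shown. Since CountVectorizer.transform expects an iterable of documents, a minimal sketch of such a helper might look like this (the cleaning steps are an assumption, not the project's actual implementation):

import re


def rf_preprocess(incident):
    # Hypothetical cleaning: lowercase, strip non-letters, collapse whitespace.
    text = re.sub(r"[^a-z\s]", " ", str(incident).lower())
    text = re.sub(r"\s+", " ", text).strip()
    # Return a one-element list, since CountVectorizer.transform
    # expects an iterable of documents, not a single string.
    return [text]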
Example #2
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


def lda_analysis(users):
    global lda_text_to_id, lda_topics_per_text

    n_features = 1000
    n_components = 50
    n_top_words = 20
    print("Constructing user docs")
    # Flatten every user's tweets into one flat list of documents.
    X = [[tweet['text'] for tweet in user.tweets] for user in users]
    X = [tweet for sublist in X for tweet in sublist]
    fact_topics = build_fact_topics()

    # Append each fact's term list as an extra document.
    for t in [' '.join(f) for f in fact_topics['fact_terms'].values]:
        X.append(t)

    print(X[:5])
    print("TF fitting user docs")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit(X)
    X_tf = tf.transform(X)

    if NEW_LDA_MODEL:
        print("Training new LDA model")
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(X_tf)
        with open('model_data/lda_model', 'wb') as tmpfile:
            pickle.dump(lda, tmpfile)
    else:
        with open('model_data/lda_model', 'rb') as tmpfile:
            lda = pickle.load(tmpfile)

    # Map each document to its row index and record its topic distribution.
    lda_text_to_id = {txt: i for i, txt in enumerate(X)}
    lda_topics_per_text = lda.transform(X_tf)

    # Print the top terms per topic. get_feature_names_out replaces the
    # get_feature_names method removed in scikit-learn 1.2.
    tf_feature_names = tf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([
            tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
        ])
        print(message)
    print()
    return lda_text_to_id, lda_topics_per_text
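The two return values pair up: lda_text_to_id maps each document to its row in the documents-by-topics matrix lda_topics_per_text. A short usage sketch (assuming users is already loaded):

text_to_id, topics_per_text = lda_analysis(users)

some_text = next(iter(text_to_id))            # any document seen during fitting
row = topics_per_text[text_to_id[some_text]]  # that document's topic distribution
print("dominant topic:", row.argmax(), "weight:", row.max())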
Example #3
import pandas as pd


def main(_):
    config = Config()
    model = Models(config)
    # Fit the tf-idf vectorizer and the classifier on the training data.
    tf = model.trainTf()
    clf = model.trianModel()
    # Segment the business-scope text, vectorize it and predict industry codes.
    testData = pd.read_csv(config.FLAGS.trainfile_dir)
    testData["cut"] = testData["scapeOfBesiness"].apply(cut)
    X_test = tf.transform(testData["cut"])
    y_test = clf.predict(X_test)
    # Attach the predicted codes and join in their human-readable labels.
    testData["secInduCode"] = pd.DataFrame(y_test)
    labels = pd.DataFrame.from_dict(getLabels())
    testData = testData.merge(labels, on="secInduCode", how="left")

    testData.to_csv(
        "D:/[email protected]/Company22.csv", index=None, encoding="utf8")

    print(testData[["companyName", "secnduName"]].head())
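The cut helper is not shown. Given that the column holds Chinese business-scope text, a plausible sketch (an assumption, not the project's actual code) is jieba word segmentation re-joined with spaces so the tf-idf vectorizer can tokenize on whitespace:

import jieba


def cut(text):
    # Hypothetical segmentation helper: split Chinese text into words and
    # re-join with spaces for whitespace-based tokenization downstream.
    return " ".join(jieba.cut(str(text)))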
Example #4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# q1, q2, dup and vocabulary_list are defined earlier in the source script.
vectorizer = TfidfVectorizer()
tf = vectorizer.fit(vocabulary_list)

# Label a question pair as duplicate when the cosine similarity of its
# tf-idf vectors exceeds 0.7, then score the predictions against the labels.
score = 0
for i in range(len(q1)):
    q1_tf = tf.transform([q1[i]]).toarray()
    q2_tf = tf.transform([q2[i]]).toarray()
    sim = cosine_similarity(q1_tf, q2_tf)[0, 0]
    pred = 1 if sim > 0.7 else 0
    if pred == dup[i]:
        score += 1
print("acc: ", (score / len(q1)) * 100)
Example #5
def get_feature_vector(test_data, vect):
    # Apply the fitted transformers in order: vect, then ptile, then tf
    # (the latter two are module-level objects fitted during training).
    return tf.transform(ptile.transform(vect.transform(test_data[0])))
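The names suggest, though the snippet does not show, that vect is a CountVectorizer, ptile a SelectPercentile feature selector and tf a TfidfTransformer. A hypothetical training-time setup consistent with the transform order above (the percentile value and the train_texts/train_labels data are assumptions):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2

# Hypothetical fitting sequence matching vect -> ptile -> tf above.
vect = CountVectorizer()
X_counts = vect.fit_transform(train_texts)            # raw term counts
ptile = SelectPercentile(chi2, percentile=10)
X_sel = ptile.fit_transform(X_counts, train_labels)   # keep top 10% of features
tf = TfidfTransformer()
tf.fit(X_sel)                                         # learn idf weights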