Example #1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2


def to_weka_arff(ngram, number_of_features):
  count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)

  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  features = count_vect.fit_transform(tweet_list)

  features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
  print(features.shape)

  arff_data = []

  arff_data.append("@RELATION sport")

  for i in range(features.shape[1]):
    arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
  arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")

  arff_data.append("@DATA")

  array_features = features.toarray()
  for i in range(len(array_features)):
    feature = array_features[i]
    label = label_list[i]
    csv_feature = ",".join(str(x) for x in feature)
    csv_feature = csv_feature + "," + label
    arff_data.append(csv_feature)

  with open('data/sport.arff', 'w') as file:
    for item in arff_data:
      file.write("%s\n" % item)
Example #3
import os
import json

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold

# Project-specific helpers (lrf_config, utility, classifier, pre_process,
# pre_process_lst) are assumed to be importable from the surrounding package.


def main():

    locations = lrf_config.get_locations()

    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    ## load and process samples
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:

        for i, line in enumerate(f):

            tweet_obj = json.loads(line.strip())

            # Twitter Text contents
            content = tweet_obj['text'].replace("\n", " ")

            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(
                content)

            tweets.append(postprocessed_tweet)

            microblog_features.append(microblogging_features)

            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst,
                                                      ngram_range=2)

    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)
    #
    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets,ngram_range=2)
    #
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()

    y_data = [y.strip('\n') for y in y_data]
    y_data = np.asarray(y_data)
    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(
            transformed_data_rahul, y_data)

        extended_features_1 = np.append(X_new.toarray(),
                                        lexicon_features,
                                        axis=1)
        extended_features_2 = np.append(extended_features_1,
                                        microblog_features,
                                        axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)
        train_list = []
        test_list = []

        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train,
                Y_train=Y_train,
                X_test=X_test,
                Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            # print('_______________________________________________________')
            # print(train_acc)
            # print(test_acc)
            train_list.append(train_acc)
            test_list.append(test_acc)

        # print('Train_Acc : ', np.mean(train_list))
        # print('Test_Acc : ', np.mean(test_list))
        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])

        # Step the feature count so the while loop terminates.
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)
    kmeans_model = KMeans(n_clusters=k).fit(X_new1)
    Kclustering_labels = kmeans_model.labels_
    '''silhouette_score returns the mean Silhouette Coefficient over all samples.
    The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.'''
    Silhouette = metrics.silhouette_score(X_new1,
                                          Kclustering_labels,
                                          metric='euclidean')
    KSilhouette.append(Silhouette)
    '''normalized_mutual_info_score lies between 0.0 and 1.0; 1.0 indicates a perfectly complete labeling.'''
    normalized_mutual_info_score = metrics.normalized_mutual_info_score(
        classification_labels, Kclustering_labels, average_method='arithmetic')
    Knormalized_mutual_info_score.append(normalized_mutual_info_score)
    'Hierarchical bottom-up clustering'
    single_linkage_model = AgglomerativeClustering(n_clusters=k,
                                                   linkage='ward').fit(
                                                       X_new1.toarray())
    Hclustering_labels = single_linkage_model.labels_
    '''silhouette_score returns the mean Silhouette Coefficient over all samples.
    The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.'''
    Silhouette = metrics.silhouette_score(X_new1,
                                          Hclustering_labels,
                                          metric='euclidean')
    HSilhouette.append(Silhouette)
    '''normalized_mutual_info_score lies between 0.0 and 1.0; 1.0 indicates a perfectly complete labeling.'''
    normalized_mutual_info_score = metrics.normalized_mutual_info_score(
        classification_labels, Hclustering_labels, average_method='arithmetic')
    Hnormalized_mutual_info_score.append(normalized_mutual_info_score)

print("Cluster range", n_cluster)
print("Silhouette scores for K-means clustering", KSilhouette)
print("Silhouette scores for AgglomerativeClustering(hierarchical clustering)",
from sklearn.datasets import load_svmlight_file
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
import matplotlib.pyplot as plt


def cluster(training_file):

    feature_vectors, targets = load_svmlight_file(training_file)
    X = feature_vectors
    y = targets
    X_new1 = SelectKBest(chi2, k=1000).fit_transform(X, y)
    X_new1 = X_new1.toarray()

    range_clusters = list(range(2, 26))  # 2 through 25
    silhouette_score_kmeans = []
    mutual_information_score_kmeans = []
    silhouette_score_agglomerative = []
    mutual_information_score_agglomerative = []

    for n in range_clusters:

        print("for value " + str(n))
        kmeans_model = KMeans(n_clusters=n).fit(X_new1)
        clustering_labels = kmeans_model.labels_
        silhouette_score = metrics.silhouette_score(X_new1,
                                                    clustering_labels,
                                                    metric='euclidean')
        silhouette_score_kmeans.append(silhouette_score)
        print("Kmeans clustering")
        print("silhouette score is " + str(silhouette_score))

        single_linkage_model = AgglomerativeClustering(
            n_clusters=n, linkage='ward').fit(X_new1)
        clustering_labels = single_linkage_model.labels_
        silhouette_score = metrics.silhouette_score(X_new1,
                                                    clustering_labels,
                                                    metric='euclidean')
        silhouette_score_agglomerative.append(silhouette_score)
        print("Hierarchical clustering")
        print("silhouette score is " + str(silhouette_score))

    for n in range_clusters:

        print("for value " + str(n))
        kmeans_model = KMeans(n_clusters=n).fit(X_new1)
        clustering_labels = kmeans_model.labels_
        mutual_information_score = metrics.normalized_mutual_info_score(
            y, clustering_labels)
        mutual_information_score_kmeans.append(mutual_information_score)
        print("Kmeans clustering")
        print("mutual information score is " + str(mutual_information_score))

        single_linkage_model = AgglomerativeClustering(
            n_clusters=n, linkage='ward').fit(X_new1)
        clustering_labels = single_linkage_model.labels_
        mutual_information_score = metrics.normalized_mutual_info_score(
            y, clustering_labels)
        mutual_information_score_agglomerative.append(mutual_information_score)
        print("Hierarchical clustering")
        print("mutual information score is " + str(mutual_information_score))

    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(range_clusters, silhouette_score_kmeans, label="kmeans")
    axarr[0].plot(range_clusters,
                  silhouette_score_agglomerative,
                  label="agglomerative")
    axarr[1].plot(range_clusters,
                  mutual_information_score_kmeans,
                  label="kmeans")
    axarr[1].plot(range_clusters,
                  mutual_information_score_agglomerative,
                  label="agglomerative")
    axarr[1].set_xlabel("X-axis: Number of clusters")
    axarr[0].set_ylabel("Y-axis: silhouette score")
    axarr[1].set_ylabel("Y-axis: Mutual information score")
    axarr[0].set_title("silhouette score")
    axarr[1].set_title("mutual information score")
    f.legend(loc="upper right")
    plt.show()
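
A minimal call sketch, assuming a libSVM-format training file exists at the illustrative path used in the other examples here:

if __name__ == '__main__':
    cluster('training_data_file.TFIDF')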
from sklearn import metrics
from sklearn.datasets import load_svmlight_file
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as pyplot
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.feature_selection import SelectKBest

feature_vectors3, targets3 = load_svmlight_file("training_data_file.TFIDF")
numberOfClusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]

kMeansSilhouette = []
kMeansMutualInformation = []
agglomerativeClusteringSilhoutte = []
agglomerativeClusteringMutualInformation = []

topHundredFeatures = SelectKBest(mutual_info_classif, k=100).fit_transform(feature_vectors3, targets3)
topHundredFeatures = topHundredFeatures.toarray()

for i in numberOfClusters:

    kmeans_model = KMeans(n_clusters=i).fit(topHundredFeatures)
    clustering_labels = kmeans_model.labels_
    silhouettescore = metrics.silhouette_score(topHundredFeatures, clustering_labels, metric='euclidean')
    mutualInformationscore = metrics.normalized_mutual_info_score(targets3, clustering_labels)
    kMeansSilhouette.append(silhouettescore)
    kMeansMutualInformation.append(mutualInformationscore)

#for i in numberOfClusters:
    single_linkage_model = AgglomerativeClustering(n_clusters=i, linkage='ward').fit(topHundredFeatures)
    clustering_labels2 = single_linkage_model.labels_
    silhouettescore2 = metrics.silhouette_score(topHundredFeatures, clustering_labels2, metric='euclidean')
    mutualInformationscore2 = metrics.normalized_mutual_info_score(targets3, clustering_labels2)
    agglomerativeClusteringSilhoutte.append(silhouettescore2)
    agglomerativeClusteringMutualInformation.append(mutualInformationscore2)
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import load_svmlight_file
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
#Loading training data file
feature_vectors, targets = load_svmlight_file('training_data_file')
#selecting features using CHI squared method
select_features = SelectKBest(chi2,
                              k=1000).fit_transform(feature_vectors, targets)
select_features = select_features.toarray()
#list of number of clusters
num_clusters = []
#list of Silhouette and Normalised Mutual Information measures for KMeans and Agglomerative clustering
Silhoutte_Kmeans = []
NMI_Kmeans = []
Silhoutte_Agg = []
NMI_Agg = []
#Calculating Silhouette and Normalised Mutual Information measures for KMeans and Agglomerative clustering for numbers of clusters from 2 to 25
for n in range(2, 26):
    #print(n)
    num_clusters.append(n)
    kmeans_model = KMeans(n_clusters=n).fit(select_features)
    clustering_labels = kmeans_model.labels_
    Silhoutte_Kmeans.append(
        metrics.silhouette_score(select_features,
                                 clustering_labels,
                                 metric='euclidean'))
Example #8
import time
import warnings

from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import load_svmlight_file
from sklearn.feature_selection import SelectKBest, chi2

start = time.time()
# load_svmlight_file returns (features, labels); the second value holds the targets.
training_tfidf, test_tfidf = load_svmlight_file('training_data_file.TFIDF')
featureSize = 20000
training_tfidf_chi2 = SelectKBest(chi2, k=featureSize).fit_transform(
    training_tfidf, test_tfidf)

clusterSize = []
kmeans_model_silhouette_score = []
kmeans_model_normalized_mutual_info_score = []
single_linkage_model_silhouette_score = []
single_linkage_model_normalized_mutual_info_score = []
for cluster in range(2, 25):
    clusterSize.append(cluster)
    kmeans_model = KMeans(n_clusters=cluster).fit(training_tfidf_chi2)
    single_linkage_model = AgglomerativeClustering(
        n_clusters=cluster, linkage='ward').fit(training_tfidf_chi2.toarray())
    warnings.filterwarnings("ignore")

    kmeans_model_silhouette_score.append(
        metrics.silhouette_score(training_tfidf_chi2,
                                 kmeans_model.labels_,
                                 metric='euclidean'))
    kmeans_model_normalized_mutual_info_score.append(
        metrics.normalized_mutual_info_score(test_tfidf,
                                             kmeans_model.labels_,
                                             average_method='arithmetic'))

    single_linkage_model_silhouette_score.append(
        metrics.silhouette_score(training_tfidf_chi2,
                                 single_linkage_model.labels_,
                                 metric='euclidean'))
# x, y and the fitted `vector` vectorizer are assumed to be defined earlier in this script.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.model_selection import train_test_split

x = vector.fit_transform(x)
print (x.shape)

# remove features with zero variance
selector = VarianceThreshold(threshold=0)
x = selector.fit_transform(x)
print (x.shape)

# select best 300 features using chi2 stats
x = SelectKBest(chi2, k=300).fit_transform(x, y)
print (x.shape)

# Convert the sparse result to a dense NumPy array; the dense array is what
# the train/test split and the SVM step below operate on.
x = x.toarray()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

test_dataset = pd.DataFrame(columns=['SampleType'])
test_dataset['SampleType'] = y_test
test_dataset.to_csv('test_file_y.csv', sep=',', encoding='utf-8')
np.savetxt('test_file_x.txt', X_test, fmt='%d')

# tf-idf: fit the IDF weights on the training split only, then apply the same
# transform to the test split.
transformer = TfidfTransformer(smooth_idf=False)
tfidf_train = transformer.fit_transform(X_train).toarray()
tfidf_test = transformer.transform(X_test).toarray()

# target names required for per class data
target_names = ["Cardiovascular / Pulmonary", "Orthopedic", "Gastroenterology", "Neurology", "Urology",
                "Obstetrics / Gynecology", "ENT - Otolaryngology", "Hematology - Oncology", "Ophthalmology", "Nephrology"]
Example #10
                                     user="******",
                                     password="******",
                                     port=3306,
                                     charset='utf8mb4')  # connect to the database
                cur = db.cursor()
                cur.execute(insert_data)
            db.commit()

        vectorizer = CountVectorizer(
            max_df=0.6, min_df=5)  # build a term-frequency matrix; element a[i][j] is the count of word j in document i
        X = vectorizer.fit_transform(corpus)
        word = vectorizer.get_feature_names()  # all terms in the bag-of-words vocabulary
        # print(word)
        print(len(word))  # vocabulary size
        # print(word[1])  # get the word at a given index in the vocabulary
        pd.DataFrame(X.toarray(), columns=word)
        # print(pd.DataFrame(X.toarray(), columns=word))  # print the term-frequency matrix
        transformer = TfidfTransformer()
        # TfidfTransformer computes the tf-idf weight of each term. With max_df/min_df
        # on the vectorizer, a float filters terms by the fraction of documents they
        # appear in, while an integer is an absolute document count; this drops words
        # that occur too often to be meaningful.
        tfidf = transformer.fit_transform(vectorizer.fit_transform(
            corpus))  # the inner fit_transform builds the term-frequency matrix; the outer one computes tf-idf
        weight = tfidf.toarray()  # tf-idf matrix; w[i][j] is the tf-idf weight of word j in document i
        # print(weight.__len__())
        weight_data = pd.DataFrame(weight, columns=word)

        print('Start Kmeans:')

        clf = KMeans(n_clusters=33,
                     random_state=1,
                     algorithm='auto',
                     n_init=10,
from sklearn.datasets import load_svmlight_file
feature_vectors, targets = load_svmlight_file("training_data_file.IDF")
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
X_new1 = SelectKBest(chi2, k=1000).fit_transform(feature_vectors, targets)

kmeans_silhoutte_scores = []
kmeans_normalized_mutual_scores = []
krange = range(2, 25)
for n in krange:

    kmeans_model = KMeans(n_clusters=n).fit(X_new1.toarray())
    clustering_labels = kmeans_model.labels_
    kmeans_silhoutte_scores.append(
        metrics.silhouette_score(X_new1.toarray(),
                                 clustering_labels,
                                 metric='euclidean'))
    kmeans_normalized_mutual_scores.append(
        metrics.normalized_mutual_info_score(targets, clustering_labels))

single_linkage_silhoutte_scores = []
single_linkage_normalized_mutual_scores = []

for n in krange:

    single_linkage_model = AgglomerativeClustering(n_clusters=n,
                                                   linkage='ward').fit(
                                                       X_new1.toarray())