Example #1
    def __init__(self, APTPosData):
        Clustering.__init__(self, APTPosData)

        self.ordered_lst = None  # indices of the points, in the order they are processed
        self.RD = None  # reachability distance per point; read via ordered_lst to get clustering order
        self.CD = None  # core distance per point; like RD, read via ordered_lst to get clustering order
        self.hierarchy_RootNode = None
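The attributes above (an ordered point list plus per-point reachability distance RD and core distance CD) are the quantities an OPTICS-style ordering produces. For reference only, a minimal sketch of how those two distances are conventionally defined, assuming a NumPy array of point positions and an illustrative min_samples value; this is not the class's own implementation:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def core_distances(points, min_samples=5):
    # core distance = distance to the min_samples-th nearest neighbour
    nn = NearestNeighbors(n_neighbors=min_samples).fit(points)
    dists, _ = nn.kneighbors(points)
    return dists[:, -1]

def reachability_distance(p, o, points, core_dist):
    # reachability of p w.r.t. o = max(core_distance(o), dist(o, p))
    return max(core_dist[o], np.linalg.norm(points[p] - points[o]))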
Example #2
def main():
    D = 20
    problem = Problem('problem.txt')
    clusters = Clustering(problem, D)
    clusters.clustering()
    # print(clusters.cluster)
    aif = ArcIndexedFormulation({
        "N": [0, 1, 2, 3, 4],
        "V": [1, 2, 3, 4],
        'k': [5, 5, 5, 5],
        'c': [10, 10, 10, 10, 10],
        's0': [5, 5, 5, 5, 5],
        't': [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5],
              [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]],
        'alpha':
        0.3,
        'T':
        100,
        'U':
        1,
        'L':
        1,
        'f':
        problem.f
    })
    aif.solve()
Example #3
def job_function():
    global datasets_json
    global result
    global data_analyst
    global map_result
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(user="******",
                                      password="******",
                                      host="10.60.156.15",
                                      port="8432",
                                      database="test_vtracking")
        cursor = connection.cursor()
        cursor.execute("select * from public.tbl_device ORDER BY RANDOM();")
        datasets_json = pd.DataFrame(cursor.fetchmany(size=50000),
                                     columns=['x', 'y', 'street', 'speed'])
        datasets_json['x'] = datasets_json['x'].apply(lambda x: float(x))
        datasets_json['y'] = datasets_json['y'].apply(lambda x: float(x))
        clustering = Clustering(datasets_json, 20)
        data_analyst = clustering._build()
        map_result = clustering.visualisation(data_analyst)
        result = clustering.export_data(data_analyst)
    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL", error)
    finally:
        # closing database connection.
        if connection:
            if cursor:
                cursor.close()
            connection.close()
            print("PostgreSQL connection is closed")
Example #4
 def __init__(self, ps, state_r3_list, para=Parameters()):
     self.para = para
     self.state_r3_list = state_r3_list
     self.centers_list = [state_r3.value for state_r3 in self.state_r3_list]
     self.clustering = Clustering()
     # list of tuples (starttime(pd.timestamp),endtime(pd.timestamp), cluster center value)
     self.description = self.clustering.ps2description(
         ps=ps, centers=self.centers_list)
Example #5
def dbscan(argv):
    if len(argv) == 1:
        print("Executing script with default dataset ./Datasets/datos_1.csv")
        data = pd.read_csv("./Datasets/datos_1.csv")
    else:
        print(f"Executing script using specified dataset {argv[1]}")
        data = pd.read_csv(argv[1])
    cluster = Clustering()
    cluster.DBScan(data)
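A hypothetical entry point for the function above, assuming the script is invoked as "python script.py [dataset.csv]":

import sys

if __name__ == "__main__":
    dbscan(sys.argv)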
Example #6
 def generateClustering(self, doc, ncls, tofile):
     '''
     generate clustering from doc
     :param doc: document file name
     :param ncls: number of clusters
     :param tofile: file name to save cluster
     :return: save to a file
     '''
     wordFreq = Clustering.freq_dict_from_doc(doc)
     self.word2cls, self.cls2wrd = Clustering.freq_to_cluster(wordFreq, ncls)
     Clustering.write_dict_of_list_to_file(self.cls2wrd, tofile)
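A hypothetical call of the method above; the no-argument constructor, file names and cluster count are assumptions, not taken from the original project:

clusterer = Clustering()
clusterer.generateClustering(doc="corpus.txt", ncls=50, tofile="word_clusters.txt")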
Example #7
 def __init__(self, path, max_replay_load=10, max_event_parsed=50):
     self.factory = SC2Factory()
     self.folder = path
     self.training_builds = {"Protoss": [], "Terran": [], "Zerg": []}
     self.predict_builds = {"Protoss": [], "Terran": [], "Zerg": []}
     self.training_replays = []
     self.predict_replays = []
     self.success_replay = []
     self.max_replay_load = max_replay_load
     self.max_event_parsed = max_event_parsed
     self.dir_list = os.listdir(self.folder)
     self.clustering = Clustering()
     self.file_handler = FileHandler()
Example #8
 def export(self,
            output_directory,
            quick=False,
            drop_annotated_instances=False):
     self.clustering = Clustering(self.instances,
                                  self.getAssignedClusters(),
                                  clustering_algo=self)
     self.clustering.generateClustering(
         self.getAllProba(),
         self.getCentroids(),
         drop_annotated_instances=drop_annotated_instances)
     self.clustering.export(output_directory)
     self.clustering.generateEvaluation(output_directory, quick=quick)
Example #9
    def __init__(self, size, features):
        self.size = size
        self.features = features

        self.test_data, t = make_blobs(n_samples=self.size,
                                       n_features=self.features)
        self.test_tensor_data = np.random.random(
            (self.size, self.features, self.features))

        self.kmeans = Clustering(10)
        self.svm = Classification()
        self.gauss = MultivariateGauss()
        self.tensor = TensorDecomposition()
Example #10
 def clusterWithParameters(self, method):
     self.clusteringParameters.setTab(method)
     result = self.clusteringParameters.exec_()
     if result == QtWidgets.QDialog.Accepted:
         self.classifier = None
         self.featurespace.setClassificationImage(None)
         self.regressor = None
         self.densityEstimator = None
         self.clusterer = Clustering(method, self.clusteringParameters,
                                     self.featurespace)
         try:
             self.clusterer.initialize()
         except AssertionError as e:
             QtWidgets.QMessageBox.warning(self, 'Error', str(e),
                                           QtWidgets.QMessageBox.Ok,
                                           QtWidgets.QMessageBox.Ok)
         self.repaint()
Example #11
def test_Clustering_dtype():
    """
    Test that the initialization of a Clustering class throws a type error for 
    things that are not pandas dataframes
    """
    some = "A wrong data type of type string"
    with pytest.raises(TypeError):
        Clustering(some)
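A minimal sketch of the guard this test implies, assuming the constructor is meant to accept only pandas DataFrames (not the project's actual implementation):

import pandas as pd

class Clustering:
    def __init__(self, data):
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Clustering expects a pandas DataFrame, got %s"
                            % type(data).__name__)
        self.data = data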
Example #12
    def run(self):
        bins = self.make_bins(self.data,
                              resolution=self.resolution,
                              overlap=self.overlap)

        nodes = []

        for s in bins.values():
            c = Clustering(data=np.array(s),
                           max_clusters=self.max_clusters,
                           cluster_alg=self.cluster_alg)
            n, groups = c.cluster()

            for g in groups:
                nodes.append(g)

        return self.make_graph(nodes)
Example #13
def run_clustering_tasks(inverted_index, indexer, root_dir):
    document_vectors = indexer.get_document_vectors(inverted_index)
    num_docs = inverted_index.get_total_docs()
    for linkage in ['min', 'max', 'avg', 'mean']:
        for value in range(5, 100, 5):
            threshold = value / 100
            cluster_name = str(threshold)
            if value % 10 == 0:
                cluster_name += '0'
            print('Using linkage: ', linkage, ' and threshold: ', cluster_name)
            clustering = Clustering(linkage, threshold, document_vectors)
            for doc_id in range(num_docs):
                clustering.add_doc_to_cluster(doc_id)
            clusters = clustering.get_clusters()
            filename = root_dir + '/evaluation/' + linkage + \
                '_linkage_clusters/' + 'cluster-' + cluster_name + '.out'
            generate_clusters_output_file(linkage, threshold, clusters,
                                          filename)
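The cluster_name padding above compensates for str() dropping the trailing zero at multiples of 0.10; an equivalent, purely illustrative formulation uses a format specifier:

threshold = value / 100
cluster_name = f"{threshold:.2f}"  # '0.05', '0.10', '0.15', ...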
Example #14
 def run(self, quick=False, drop_annotated_instances=False):
     if self.experiment.conf.projection_conf is not None:
         try:
             self.projectInstances(quick)
         except FewerThanTwoLabels:
             warnings.warn(
                 'There are too few class labels.'
                 'The instances are not projected before building the clustering.'
             )
     self.compute()
     self.clustering = Clustering(self.experiment,
                                  self.instances,
                                  self.getAssignedClusters(),
                                  clustering_algo=self)
     self.clustering.generateClustering(
         self.getAllProba(),
         self.getCentroids(),
         drop_annotated_instances=drop_annotated_instances)
     self.clustering.generateEvaluation(quick=quick)
Example #15
class ClusteringAlgorithm(object):
    def __init__(self, instances, conf):
        self.instances = instances
        self.conf = conf
        self.num_clusters = self.conf.num_clusters
        self.clustering = None

    @abc.abstractmethod
    def getDistortion(self):
        return

    @abc.abstractmethod
    def getCentroids(self):
        return

    @abc.abstractmethod
    def getAssignedClusters(self):
        return

    def getPredictedProba(self):
        return None

    def getAllProba(self):
        return None

    def fit(self):
        self.pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('clustering', self.algo)])
        self.pipeline.fit(self.instances.getFeatures())

    def export(self,
               output_directory,
               quick=False,
               drop_annotated_instances=False):
        self.clustering = Clustering(self.instances,
                                     self.getAssignedClusters(),
                                     clustering_algo=self)
        self.clustering.generateClustering(
            output_directory,
            self.getAllProba(),
            self.getCentroids(),
            drop_annotated_instances=drop_annotated_instances)
        self.clustering.generateEvaluation(output_directory, quick=quick)
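For illustration, a hypothetical concrete subclass of the abstract class above, assuming scikit-learn's KMeans supplies the quantities the abstract methods expose; this subclass is not part of the original project:

from sklearn.cluster import KMeans

class KMeansAlgorithm(ClusteringAlgorithm):
    def __init__(self, instances, conf):
        super().__init__(instances, conf)
        self.algo = KMeans(n_clusters=self.num_clusters)

    def getDistortion(self):
        # sum of squared distances to the closest centroid (available after fit())
        return self.pipeline.named_steps['clustering'].inertia_

    def getCentroids(self):
        return self.pipeline.named_steps['clustering'].cluster_centers_

    def getAssignedClusters(self):
        return self.pipeline.named_steps['clustering'].labels_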
Example #16
def getDistribitions(ps, redd_hdf5_path='/home/uftp/hubei/30xusuqian.h5',
                     center_path='/home/uftp/hubei/ori/xusuqian', load=False):

    # Load the data
    # redd_hdf5_path = '/home/uftp/hubei/30xusuqian.h5'
    # datastore=Data_store(redd_hdf5_path='/home/uftp/hubei/30xusuqian.h5')
    datastore = Data_store(redd_hdf5_path=redd_hdf5_path)
    # Load the cluster centers
    clustering = Clustering(redd_hdf5_path)
    clustering.getCenterDict(center_path)
    # Get the aggregate (circuit-breaker) power data
    # ps = clustering.data_store.get_instance_ps(appliance_name='meter', instance='1')
    # Get the circuit-breaker cluster centers (state_r3)
    # centers_list=clustering.deal_with_ps(ps=ps,not_deal_off=False)
    # centers_list=clustering.deal_with_ps_b(ps=ps,not_deal_off=False)
    # centers_list.append(0)
    # centers_list = [1310.8869965248609, 1576.0193551020407, 1756.5507647887323, 2228.0983558139533, 2851.1088421052636,
    #                 0]
    # print(centers_list)
    # description=clustering.ps2description(ps=ps,centers=centers_list)
    # print(description)
    # kmeans = KMeans(n_clusters=len(centers_list), random_state=0)
    # centers_array = np.array([[center] for center in centers_list]).reshape(-1, 1)
    # kmeans.cluster_centers_ = centers_array
    # Get the original state_list
    thepath = center_path
    states_list = getStates_list(thepath=thepath)
    # Extract the probability distributions and store them in a dict
    thedict = clustering.deal_all_instance()
    print(thedict)
    # Save the dict to disk so it does not have to be re-extracted every run
    # TODO: to speed things up, read the distributions from disk here instead
    serialize_object(thedict, 'allappliancesdict')
    # Read the dict back from disk
    # thedict = deserialize_object('allappliancesdict')
    # Adjust states_list by attaching the distributions
    for state in states_list:
        state.thedict = thedict
        state.feed2distribution()
    return 1, states_list

# getDistribitions()
Example #17
def identify_clusters_in_project(project_name, project_path):
    create_logging_folders(project_name)
    temp_json_location = f'{Settings.DIRECTORY}/data/output.json'

    utils.execute_parser(project_path)

    # Read parsed document
    parsed_raw_json = {}
    with open(temp_json_location) as json_file:
        parsed_raw_json = json.load(json_file)

    classes = extract_classes_information_from_parsed_json(parsed_raw_json)

    graph = nx.DiGraph()
    graph = Graph.create_dependencies(classes, graph)

    lda.apply_lda_to_classes(graph, classes)
    calculate_absolute_weights(graph, classes, weight_type=WeightType.LDA)

    # TODO : think about if the pre_processing should be done or not
    graph = Clustering.pre_process(graph,
                                   remove_weak_edges=False,
                                   remove_disconnected_sections=True)

    clusters_results = []
    if Settings.RESOLUTION:
        clusters, modularity = Clustering.community_detection_louvain(
            graph, resolution=Settings.RESOLUTION)
        clusters_results.append((clusters, modularity, Settings.RESOLUTION))
        Clustering.write_modularity_and_services(clusters_results)
    else:
        clusters_results = Clustering.compute_multiple_resolutions(graph,
                                                                   start=0.3,
                                                                   end=1.1,
                                                                   step=0.1)

    # TODO: Reconsider techniques of post-processing
    # clusters = PostProcessing.process(clusters, classes, graph.copy())
    return clusters_results
Example #18
 def setUp_dynamic(self):
     # select total power data in a period
     # datastore = Data_store(redd_hdf5_path='D:\SJTU\湖北项目\数据\h5s/30xiayurong.h5')
     # datastore = Data_store(redd_hdf5_path='D:\SJTU\湖北项目\数据\h5s/30xusuqian.h5')
     # datastore = Data_store(redd_hdf5_path='/home/uftp/hubei/30xusuqian.h5')
     # datastore = Data_store(redd_hdf5_path='/home/uftp/hubei/30fake.h5')
     pss = []
     # datarange = [pd.Timestamp('2017-12-15 10:00:00'), pd.Timestamp('2017-12-15 12:00:00')]
     appliance_truth = {}
     appliance_consumtion = {}
     for app in self.datastore.appliance_names:
         if (app in ['meter', 'TVbox', 'TV']) and self.home == 'xusuqian':
             if app == 'meter':
                 totalpower = self.datastore.get_instance_ps(
                     appliance_name=app,
                     instance='1').loc[self.datarange[0]:self.datarange[-1]]
             continue
         theps = self.datastore.get_instance_ps(
             appliance_name=app,
             instance='1').loc[self.datarange[0]:self.datarange[-1]]
         appliance_truth[app] = theps
         pss.append(theps)
         appliance_consumtion[app] = Tools.ps_consumption(theps=theps)
         # if(app=='lamp'):
         #     print()
     knownps = aggregate_with_resample(pss)
     appliance_truth['unknown'] = totalpower - knownps
     appliance_consumtion['unknown'] = Tools.ps_consumption(
         theps=appliance_truth['unknown'])
     ps = totalpower
     cluster = Clustering()
     print('miaomiaomiao?')
     del pss
     # ps.plot()
     # ps=median_filter(ps=ps)
     # ps.plot()
     # plt.show()
      # Get the states_list
     from readData.getdistributions import getDistribitions
     self.appliance_truth = appliance_truth
     # centers_list, states_list = getDistribitions(ps=ps, redd_hdf5_path='D:\SJTU\湖北项目\数据\h5s/30xusuqian.h5',
     #                                              center_path='D:\SJTU\湖北项目\数据\ori\\xusuqian')
     # centers_list, states_list = getDistribitions(ps=ps)
     centers_list, states_list = getDistribitions(
         ps=ps,
         redd_hdf5_path='/home/uftp/hubei/30%s.h5' % self.home,
         center_path='/home/uftp/hubei/ori/%s' % self.home,
         load=False)
     self.appliance_consumtion = appliance_consumtion
     self.inference = Inference(total_ps=ps, states_list=states_list)
Example #19
def step_clustering(ps, minstep=10):
    clustering = Clustering()
    centers = clustering.deal_with_ps_b(ps=ps)
    centers.sort()
    diff = np.diff(centers)
    if min(diff) < minstep:
        return centers
    else:
        data = ps.values.reshape(-1, 1)
        clustering = KMeans(n_clusters=len(centers) + 1, init='k-means++', n_init=20)
        clustering.fit(data)
        centers = np.copy(clustering.cluster_centers_).reshape(-1)
        centers.sort()
    maxiter = 20
    iternum = 1
    while min(np.diff(centers)) > minstep or iternum < maxiter:
        clustering = KMeans(n_clusters=len(centers) + 1, init='k-means++', n_init=20)
        clustering.fit(data)
        centers = np.copy(clustering.cluster_centers_).reshape(-1)
        centers.sort()
        iternum += 1
    return list(centers.reshape(-1))
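A hypothetical call of step_clustering, assuming ps is a pandas Series of power readings and that the project's Clustering class (with its deal_with_ps_b method) is importable:

import numpy as np
import pandas as pd

ps = pd.Series(np.random.choice([0.0, 60.0, 800.0, 1500.0], size=2000)
               + np.random.randn(2000))
centers = step_clustering(ps, minstep=10)
print(centers)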
Example #20
def set_weight_for_clustering(graph, class_visitors, topics_per_doc, k):

    class_topics = {
        z[0]: z[1]
        for z in zip(class_visitors.keys(), topics_per_doc)
    }

    for src, dst in graph.edges():
        similarity = 0
        try:

            src_vector = topics_vector(class_topics[src], k)
            dst_vector = topics_vector(class_topics[dst], k)

            if len(src_vector) != 0 and len(dst_vector) != 0:
                similarity = Clustering.cosine_similarity(
                    src_vector, dst_vector)

            graph[src][dst][str(WeightType.LDA)] = similarity
            print(f" {src} -> {dst} similarity of {similarity}")
        except KeyError:
            pass
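For reference, a standalone helper equivalent in spirit to the Clustering.cosine_similarity call used above, assuming both topic vectors are plain numeric sequences (a sketch, not the project's own code):

import numpy as np

def cosine_similarity(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # treat a zero vector as having no similarity to anything
    return float(a @ b / denom) if denom else 0.0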
Example #21
class CPUImpl:
    def __init__(self, size, features):
        self.size = size
        self.features = features

        self.test_data, t = make_blobs(n_samples=self.size,
                                       n_features=self.features)
        self.test_tensor_data = np.random.random(
            (self.size, self.features, self.features))

        self.kmeans = Clustering(10)
        self.svm = Classification()
        self.gauss = MultivariateGauss()
        self.tensor = TensorDecomposition()

    def evaluate(self):
        print("Data set: %s samples" % self.size)
        print("Features: %s" % self.features)
        print("======")
        print("KMeans: %s s" % self.kmeans.evaluate(self.test_data))
        print("OneClassSVM: %s s" % self.svm.evaluate(self.test_data, "svm"))
        print("Gauss: %s s" % self.gauss.evaluate(self.test_data))
        print("Parafac: %s s" % self.tensor.evaluate(self.test_tensor_data))
Example #22
                             color='green')
                ax[i, 0].plot(dfg.index.values,
                              dfg[field + '_intensity_ewma'].values,
                              color='green')
                ax[i, 0].set_title('maturity ' + matu, fontsize=8)
                ax3.plot(dfg.index, dfg[param[field]], color='DarkBlue')
                ax3.plot(dfg.index,
                         dfg[param[field] + '_ewma'],
                         color='DarkBlue')
                if i == 0:
                    legend=ax3.legend([ax[i, 0].get_lines()[0], ax3.get_lines()[0]], \
                       [field + ' intensity', param[field]], bbox_to_anchor=(1, 1), fontsize=12)
        plt.show()


if __name__ == '__main__':
    udl = 'DAI'
    reference_date = '20210105'
    folder1 = 'D:/Users/GitHub/TradesDynamics/processed'
    folder2 = 'D:/Users/GitHub/TradesDynamics/parameters'
    DT = DateAndTime(reference_date, reference_date)
    TF = TradeFlesh(udl, DT, folder1, folder2)
    TF.pct_aggressivity()
    TF.graph_aggressivity('20190710')
    TF.get_intensity()
    TF.graph_sensitivity('vega', '20190710')

    C = Clustering(udl)
    C.prepare_data()
    TF.graph_aggressivity('20190710', C.trades(1))
Example #23
##predict data...
import demo
model=demo.stacked_lstm_ae(8,4096,'relu',32,'sgd',0.2,0.1)
model.load_weights(MODEL_FILE)
from tensorflow.python.keras.models import Model
model = Model(inputs=model.inputs, outputs=model.get_layer("encoder").output)
data = model.predict(data)

# Reshape
data = data.reshape(data.shape[1], data.shape[0])
ds = Dataset_transformations(data.T, 1000, data.shape)
if os.path.exists(PREFIX+CONFIG_NAME+'.zip'):
    clust_obj = dataset_utils.load_single(PREFIX+CONFIG_NAME+'.zip')
else:
    print 'Doing kmeans.....'
    clust_obj = Clustering(ds,n_clusters=15,n_init=100,features_first=False)
    clust_obj.batch_kmeans(10)
    print 'Saving .....'
    clust_obj.save(PREFIX+CONFIG_NAME+'.zip')

# Descriptor num_min: 1
num_min = 1
times_pos = closest(clust_obj._link, ds._items, num_min, win=4, t=8, save=False)
np.save(PREFIX+'time_pos_desc'+str(num_min)+'.npy', times_pos)

ns = netCDF_subset(NC_PATH, [500, 700, 900], ['GHT'], timename='Times')
desc_date(clust_obj, ns, times_pos)
clust_obj.save(PREFIX+CONFIG_NAME+'_'+str(num_min)+'.zip')

for c, i in enumerate(times_pos):
    if not os.path.exists(PREFIX+'descriptors1/'):
Example #24
class CVManager:
    def __init__(self):
        self.CVList = []
        self.cvsFile = "documentMatrix.csv"
        self.CVFileName = []
        self.fileNamesWithPath = []
        self.cvPostList = []
        self.CVTextColl = []
        self.noOfTopCV = 10
        self.orderedCVList = []
        self.languageProcessing = None
        self.clusteringInfo = None
        self.CVRanker = None

    def list_CVs(self, rootPath):

        for root, dirs, files in os.walk(rootPath):
            for name in files:
                self.CVFileName.append(name)
                self.fileNamesWithPath.append(os.path.join(root, name))
                self.cvPostList.append(
                    os.path.basename(os.path.dirname(os.path.join(root,
                                                                  name))))

    def collectCV(self):
        for cvFilePath, cvFileName, cvPost in zip(self.fileNamesWithPath,
                                                  self.CVFileName,
                                                  self.cvPostList):
            try:
                newCV = CV(cvFileName, cvFilePath, cvPost)
                self.CVList.append(newCV)
            except Exception as e:
                print(cvFileName)
                print("in collection of CV \t" + str(e))

    def collectCVText(self):
        self.CVTextColl = []
        for cv in self.CVList:
            self.CVTextColl.append(cv.textHandeller.cleanText)

    def findDocumentMatrix(self, minFrequency, vocab):
        self.collectCVText()
        self.languageProcessing = NLTKHelper()
        self.languageProcessing.findDocumentMatrix(self.CVTextColl,
                                                   minFrequency, vocab)
        #df = pd.DataFrame(self.languageProcessing.documentMatrix.toarray())
        #df.to_csv(self.cvsFile)
        self.assignFeatureVector()

    def assignFeatureVector(self):
        #         for cv,cvNum in zip(self.CVList,range(0,len(self.CVFileName)-1)):
        #             for featureRow in range(0,len(self.languageProcessing.vocabulary)-1):
        #                 cv.featureVector.append(self.languageProcessing.normalizedFeatureSet[cvNum][featureRow])
        for cv, cvNum in zip(self.CVList, range(len(self.CVFileName))):
            cv.featureVector = self.languageProcessing.normalizedFeatureSet[
                cvNum]
        for cv, cvNum in zip(self.CVList, range(len(self.CVFileName))):
            cv.frequencyVector = self.languageProcessing.documentMatrix.toarray(
            )[cvNum]
            #frequencyVector
    def makeGraph(self, data):
        lists = sorted(data)  # sorted by key, return a list of tuples
        x, y = zip(*lists)  # unpack a list of pairs into two tuples
        plt.plot(x, y)
        plt.show()

    def clusterData(self):
        self.clusteringInfo = Clustering()
        self.clusteringInfo.clusterData(
            self.languageProcessing.normalizedFeatureSet)
        self.makeGraph(self.clusteringInfo.silCoeffInfo.items())

    def rankCV(self):
        self.CVRanker = CBRAlgo()
        self.CVRanker.calculateCVScore(self.languageProcessing.documentMatrix,
                                       self.languageProcessing.vocabulary,
                                       self.clusteringInfo)
        for cv, cvScore in zip(self.CVList, self.CVRanker.CVScoreList):
            cv.score = cvScore

    def showAnalytics(self):
        self.CVRanker.plotTopWordsPerCluster()
        self.CVRanker.plotOverAllWeight(self.languageProcessing.vocabulary)

    def showTopCVPerPost(self, post):
        cvlist = {}
        cvScore = []
        cvData = []
        if post is None:
            for cvCategery in set(self.cvPostList):
                cvlist = {}
                print("cv of %s" % cvCategery)
                for cv in self.CVList:
                    if (cv.CVCategory == cvCategery):
                        cvlist.update({cv: cv.score})
                temp = [(value, key) for key, value in cvlist.items()]
                temp.sort()
                temp.reverse()
                temp = [(key, value) for value, key in temp]
                cvData = temp
                return cvData
        else:
            try:
                print("cv of post %s" % post)
                for cv in self.CVList:
                    if (cv.CVCategory == post):
                        cvlist.update({cv: cv.score})
                temp = [(value, key) for key, value in cvlist.items()]
                temp.sort()
                temp.reverse()
                temp = [(key, value) for value, key in temp]
                cvData = temp
                return cvData
            except Exception as e:
                print(cv.fileName)
                print("finding top CV \t" + str(e))

    def compareCV(self):
        topCVNum = 2
        cvIndex = 0
        #print(self.orderedCVList)
        for cvSample in self.orderedCVList[:topCVNum]:
            xValue = np.arange(len(cvSample[cvIndex].frequencyVector))
            plt.plot(xValue,
                     cvSample[cvIndex].frequencyVector,
                     markersize=10,
                     label=cvSample[cvIndex].fileName + '=' +
                     str(cvSample[cvIndex].score))
            #plt.rcParams["figure.figsize"] = (10,20)
        plt.title('CV comparision top cv')
        plt.ylabel('Feature Vector Frequency')
        plt.legend(mode="expand")
        plt.xlabel('Relevant words index')
        #plt.rcParams["figure.figsize"] = (10,10)
        plt.show()
        for cvSample in self.orderedCVList[int(len(self.orderedCVList) / 2) -
                                           1:int(len(self.orderedCVList) / 2) +
                                           1]:
            xValue = np.arange(len(cvSample[cvIndex].frequencyVector))
            plt.plot(xValue,
                     cvSample[cvIndex].frequencyVector,
                     markersize=10,
                     label=cvSample[cvIndex].fileName + '=' +
                     str(cvSample[cvIndex].score))
        plt.title('CV comparision middle CV')
        plt.ylabel('Feature Vector Frequency')
        plt.legend(mode="expand")
        plt.xlabel('Relevant words index')
        #plt.rcParams["figure.figsize"] = (10,10)
        plt.show()
        for cvSample in self.orderedCVList[-2:]:
            xValue = np.arange(len(cvSample[cvIndex].frequencyVector))
            plt.plot(xValue,
                     cvSample[cvIndex].frequencyVector,
                     markersize=10,
                     label=cvSample[cvIndex].fileName + '=' +
                     str(cvSample[cvIndex].score))
        plt.title('CV comparision last CV')
        plt.ylabel('Feature Vector Frequency')
        plt.legend(mode="expand")
        plt.xlabel('Relevant words index')
        #plt.rcParams["figure.figsize"] = (10,10)
        plt.show()
Example #25
 def clusterData(self):
     self.clusteringInfo = Clustering()
     self.clusteringInfo.clusterData(
         self.languageProcessing.normalizedFeatureSet)
     self.makeGraph(self.clusteringInfo.silCoeffInfo.items())
Example #26
class ReplayParser:
    def __init__(self, path, max_replay_load=10, max_event_parsed=50):
        self.factory = SC2Factory()
        self.folder = path
        self.training_builds = {"Protoss": [], "Terran": [], "Zerg": []}
        self.predict_builds = {"Protoss": [], "Terran": [], "Zerg": []}
        self.training_replays = []
        self.predict_replays = []
        self.success_replay = []
        self.max_replay_load = max_replay_load
        self.max_event_parsed = max_event_parsed
        self.dir_list = os.listdir(self.folder)
        self.clustering = Clustering()
        self.file_handler = FileHandler()

    def run(self):
        self.train()
        self.predict()

    def train(self):
        print('Begin replay loading for training...')
        self.load_replays(self.max_replay_load, self.training_replays, 0)
        print('Done\n')
        print('Begin replay parsing for training...')
        for replay in self.training_replays:
            self.parse_replay(replay, self.training_builds)
        print('Done\n')
        print('Number of builds parsed for training: ',
              self.number_of_builds(self.training_builds))
        self.train_cluster_builds()

    def predict(self, max_replays=50):
        print("Begin replay loading for prediction...")
        self.load_replays(max_replays + 10, self.predict_replays, 10)
        print("Done\n")
        print("Begin replay parsing for prediction...")
        for replay in self.predict_replays:
            self.parse_replay(replay, self.predict_builds)
        print("Done\n")
        print('Number of builds parsed for prediction: ',
              self.number_of_builds(self.predict_builds))
        build_dict = self.predict_cluster_builds()
        print(build_dict)
        self.file_handler.put_builds_in_folders(build_dict)

    def number_of_builds(self, build_dict):
        return sum([
            len(build_dict["Protoss"]),
            len(build_dict["Terran"]),
            len(build_dict["Zerg"])
        ])

    def load_replays(self, max_replays, replay_array, count):
        while count < max_replays:
            filename = self.dir_list[count]
            try:
                # print("Loading replay : ", filename)
                replay = SC2Factory.load_replay(
                    self.factory, os.path.join(self.folder, filename))
                replay_array.append(replay)
            except UnicodeDecodeError:
                pass
            count += 1

    def parse_replay(self, replay, build_dict):
        for team in replay.teams:
            for player in team.players:
                if player.detail_data['race'] == 'Protoss':
                    parser = self.get_parser(player)
                    engine.register_plugin(SupplyTracker())
                    engine.register_plugin(parser)
                    engine.run(replay)
                    self.update_builds(player, parser, replay, build_dict)

    def print_builds(self):
        for build in self.training_builds["Protoss"]:
            print(build)

    def train_cluster_builds(self):
        self.clustering.train(self.training_builds["Protoss"])

    def predict_cluster_builds(self):
        return self.clustering.predict(self.predict_builds["Protoss"],
                                       self.predict_replays)

    def update_builds(self, player, parser, replay, build_dict):
        if len(parser.numeric_tuples) == self.max_event_parsed:
            self.success_replay.append(replay)
            if player.detail_data['race'] == 'Protoss':
                build_dict["Protoss"].append(parser.numeric_tuples)
            elif player.detail_data['race'] == 'Terran':
                build_dict["Terran"].append(parser.numeric_tuples)
            if player.detail_data['race'] == 'Zerg':
                build_dict["Zerg"].append(parser.numeric_tuples)

    def get_parser(self, player):
        race = player.detail_data['race']
        if race == 'Protoss':
            return ProtossParser(player, self.max_event_parsed)
        elif race == 'Terran':
            return TerranParser(player, self.max_event_parsed)
        else:
            return ZergParser(player, self.max_event_parsed)
Example #29
class ClusteringAlgorithm(object):
    def __init__(self, instances, experiment):
        self.instances = instances
        self.experiment = experiment
        self.num_clusters = experiment.conf.num_clusters
        self.clustering = None

    @abc.abstractmethod
    def getDistortion(self):
        return

    @abc.abstractmethod
    def getCentroids(self):
        return

    @abc.abstractmethod
    def getAssignedClusters(self):
        return

    def getPredictedProba(self):
        return None

    def getAllProba(self):
        return None

    def run(self, quick=False, drop_annotated_instances=False):
        if self.experiment.conf.projection_conf is not None:
            try:
                self.projectInstances(quick)
            except FewerThanTwoLabels:
                warnings.warn(
                    'There are too few class labels.'
                    'The instances are not projected before building the clustering.'
                )
        self.compute()
        self.clustering = Clustering(self.experiment,
                                     self.instances,
                                     self.getAssignedClusters(),
                                     clustering_algo=self)
        self.clustering.generateClustering(
            self.getAllProba(),
            self.getCentroids(),
            drop_annotated_instances=drop_annotated_instances)
        self.clustering.generateEvaluation(quick=quick)

    def projectInstances(self, quick):
        projection_exp = self.createProjectionExperiment()
        algo = projection_exp.conf.algo
        projection = algo(projection_exp)
        instances = projection.getFittingInstances(self.instances)
        projection.fit(instances, visu=not quick)
        self.instances = projection.transform(self.instances,
                                              visu=not quick,
                                              performance=not quick)

    def compute(self):
        self.pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('clustering', self.algo)])
        self.pipeline.fit(self.instances.getFeatures())

    def createProjectionExperiment(self):
        exp = self.experiment
        name = '-'.join([exp.experiment_name, 'projection'])
        projection_conf = exp.conf.projection_conf
        projection_exp = ProjectionExperiment(
            exp.project,
            exp.dataset,
            exp.db,
            exp.cursor,
            experiment_name=name,
            experiment_label=exp.experiment_label,
            parent=exp.experiment_id)
        projection_exp.setConf(projection_conf)
        projection_exp.setFeaturesFilenames(exp.features_filenames)
        projection_exp.createExperiment()
        projection_exp.export()
        return projection_exp
Example #30
    #           str += v
    #       # print(str)
    #       nvarin.append(str)
    # times = []
    # for var in nvarin:
    #     under_split = var.split('_')
    #     date_split = under_split[0].split('-')
    #     time_split = under_split[1].split(':')
    #     date_object = datetime.datetime(int(date_split[0]), int(date_split[1]), int(date_split[2]), int(time_split[0]), int(time_split[1]))
    #     times.append(date_object)
    # print times[0:10]

    # print(len(times))

    # print ds._items.shape
    clust_obj = Clustering(ds, n_clusters=15, n_init=100, features_first=False)
    clust_obj.batch_means()
    # clust_obj.create_density_descriptors(8,times) # 8 6hour snapshot = 2 days
    clust_obj.create_km2_descriptors(12)

    sys.path.insert(1, '../wrfhy/wrfvol/')
    os.chdir("../wrfhy/wrfvol/")

    export_template = netCDF_subset('40years.nc', [700], ['GHT'],
                                    lvlname='num_metgrid_levels',
                                    timename='Times')
    export_template._time_name = 'Times'

    sys.path.insert(1, '../../final_eval/')
    os.chdir("../../final_eval/")
    a = os.getcwd()
Example #31
                 for v, vn in list(zip(comb, combo_name))]) + "-Agg" + str(agg)
            vals.append(param)
            gb.K = K

            signalsValues, modes = ArtificialData(
                noise=N, ptrn=P).run()  #parts=30) # VS, ES, APP, BPP, ECT
            sigReaders = [
                SignalReaderArtificial(signame="Signal" + str(i),
                                       sigvalues=values,
                                       modes=modes)
                for i, values in enumerate(signalsValues)
            ]
            app = App(sigReaders)

            DATA, AXES_INFO = app.build_features_data()
            clust = Clustering(DATA, scale=True, features=None).gmm(
                k=3)  #k=gb.K) # kmeans, dpgmm, gmm
            app.init_clust_tracker(clust, AXES_INFO)

            PLOT_PATH = gb.PLOT_PATH + str(id_combin) + '/' + str(param) + '/'
            if not os.path.exists(PLOT_PATH): os.makedirs(PLOT_PATH)
            path = PLOT_PATH + str(id_combin) + '_'
            app.logInformations(id_combin=id_combin, clust=clust, path=path)

            #(ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps) = app.tracking(path=path)
            STATS, RESULTS = app.tracking(path=gb.PLOT_PATH, )
            times_fsp, axes_fsp, labels_fsp, times_ssp, axes_ssp, labels_ssp = RESULTS
            (ari_fps,
             ari_sps), (ami_fps,
                        ami_sps), (ho_fps, ho_sps), (com_fps,
                                                     com_sps), (vm_fps,
                                                                vm_sps) = STATS
Example #32
    ds = Dataset_transformations(items, 1000)
    print ds._items.shape
    times = export_template.get_times()
    nvarin = []
    for var in export_template.get_times():
        str = ""
        for v in var:
            str += v
        nvarin.append(str)
    times = []
    for var in nvarin:
        under_split = var.split('_')
        date_split = under_split[0].split('-')
        time_split = under_split[1].split(':')
        date_object = datetime.datetime(int(date_split[0]), int(date_split[1]),
                                        int(date_split[2]), int(time_split[0]),
                                        int(time_split[1]))
        times.append(date_object)
    print times[0:10]
    clust_obj = Clustering(ds, n_clusters=16, n_init=100, features_first=False)
    clust_obj.kmeans_plus(init=seed)
    print clust_obj._centroids.shape
    # clust_obj.create_km2_descriptors(12)
    clust_obj.create_density_descriptors(12, times)
    export_template = netCDF_subset(NC_PATH, [700], ['GHT'],
                                    lvlname='num_metgrid_levels',
                                    timename='Times')
    clust_obj.mult_desc_date(export_template)
    utils.export_descriptor_mult_dense(outp, export_template, clust_obj)
    clust_obj.save(PREFIX + '_mult_dense.zip')
Example #33
def main():
    clusterEngine = Clustering(1000, 10)
    clusterEngine.parseInput('input.txt')
    clusterEngine.initClusters()
    clusterEngine.initClusterPairProbas()
    clusterEngine.getInitialPartitionScore()

    for i in range(8):
        clusterEngine.mergeClustersWithBestPotential()

    clusterEngine.printOutClusters('output.txt')
Example #34
 def getCenters(self, ps):
     clustering = Clustering()
     # centers_list = clustering.deal_with_ps(ps=ps, not_deal_off=False)
     centers_list = clustering.deal_with_ps_b(ps=ps, not_deal_off=False)
     centers_list.append(0)
     return centers_list
Example #35
#################  Preprocess for clustering   ####################
#get text of each tweet
tweet_text = twt.get_tweet_text(json_tweets)
for tweet in tweet_text:
    tweet_text[tweet_text.index(tweet)] = " ".join(
                                                Preprocess().remove_stopwords(
                                                    TweetTokenizer().tokenize(
                                                        Preprocess().expand_contraction(tweet))))




########################   Clustering     #########################

#####    k-means   ######
clr = Clustering()

best_k = clr.gap_statistic(tweet_text, kmin=2, kmax=10)
clr.best_kmeans(best_k, tweet_text)
clr.set_tweet_topic(json_tweets)
twt.json_tweets = json_tweets
twt.save_tweets(file)

# present topic popularity evolution and its polarity evolution over time
present_all_data(best_k, json_tweets)


##### Htag classifier ###
Ht = HtagClassifier()
plot(json_tweets)