Exemplo n.º 1
0
    def testCompleteLinkage(self):
        """Complete-linkage clustering of the integer fixture at level 40."""
        clusterer = HierarchicalClustering(
            self.__data,
            lambda a, b: abs(a - b),
            linkage='complete')

        # Order the members of every cluster and the clusters themselves so
        # the comparison does not depend on the order the algorithm emits.
        actual = sorted(sorted(group) for group in clusterer.getlevel(40))

        self.assertEqual(actual, [
            [24],
            [84],
            [124, 131, 134],
            [336, 365, 365],
            [391, 398],
            [518],
            [542, 564],
            [594],
            [676],
            [791],
            [835],
            [940, 956, 971],
        ])
Exemplo n.º 2
0
def getSubbatch(images, image_labels, similar_thred):
    """Group images of similar size into sub-batches that share one shape.

    Each image is described by ``(shape[0], shape[1], index)``. Images are
    clustered on the Manhattan distance between their first two shape
    dimensions; every cluster with more than one member is resized to the
    per-dimension median size of that cluster so it can be stacked.

    Args:
        images: sequence of arrays exposing ``.shape``.
        image_labels: sequence of label arrays, parallel to ``images``.
        similar_thred: distance level passed to ``getlevel``.

    Returns:
        A list of dicts with keys ``'images'`` and ``'labels'`` holding the
        stacked arrays of one sub-batch each, ordered from the smallest
        cluster to the largest.
    """
    sizes = [(image.shape[0], image.shape[1], idx)
             for idx, image in enumerate(images)]

    if len(sizes) == 1:
        # With a single item ``getlevel`` hands back the flat input instead
        # of a list of clusters, which would make the 3-tuple itself look
        # like a cluster of three members. Wrap it explicitly.
        clusters = [sizes]
    else:
        cl = HierarchicalClustering(
            sizes, lambda x, y: abs(x[0] - y[0]) + abs(x[1] - y[1]))
        # The original called sorted() and threw the result away, so the
        # clusters were never actually ordered by size.
        clusters = sorted(cl.getlevel(similar_thred), key=len)

    subbatches = []
    for cluster in clusters:
        if len(cluster) > 1:
            # Median over the rows of (dim0, dim1, index); only the first
            # two entries are used as the target size.
            ideal_size = [int(i) for i in np.median(cluster, axis=0)]
            # cv2.resize takes its target size as (shape[1], shape[0]).
            target = (ideal_size[1], ideal_size[0])
            subbatch_im = []
            subbatch_label = []
            for img in cluster:
                if img[0] != ideal_size[0] or img[1] != ideal_size[1]:
                    # NOTE(review): labels are resized with cv2's default
                    # interpolation - confirm that is acceptable for the
                    # label encoding in use.
                    subbatch_im.append(cv2.resize(images[img[2]], target))
                    subbatch_label.append(
                        cv2.resize(image_labels[img[2]], target))
                else:
                    subbatch_im.append(images[img[2]])
                    subbatch_label.append(image_labels[img[2]])
            subbatches.append({
                'images': np.array(subbatch_im),
                'labels': np.array(subbatch_label)
            })
        else:
            subbatches.append({
                'images': np.array([images[cluster[0][2]]]),
                'labels': np.array([image_labels[cluster[0][2]]])
            })
    return subbatches
Exemplo n.º 3
0
 def testDataTypes(self):
     """Every cluster returned by ``getlevel`` must be a plain list."""
     clusterer = HierarchicalClustering(self.__data, self.sim)
     clusters = clusterer.getlevel(0.5)
     for cluster in clusters:
         self.assertEqual(type(cluster), list,
                          "Every item should be a list!")
Exemplo n.º 4
0
 def testClusterLen1(self):
     """Clustering a one-element input yields that one-element list."""
     clusterer = HierarchicalClustering([876], lambda a, b: abs(a - b))
     outcome = clusterer.getlevel(40)
     self.assertEqual([876], outcome)
Exemplo n.º 5
0
 def testClusterLen1(self):
     """Clustering a one-element input yields the same single item.

     The comparison goes through ``assertCItemsEqual``, a helper the
     enclosing test class is expected to provide.
     """
     clusterer = HierarchicalClustering([876], lambda a, b: abs(a - b))
     outcome = clusterer.getlevel(40)
     self.assertCItemsEqual([876], outcome)
Exemplo n.º 6
0
 def testMultiprocessing(self):
     """Clustering with ``num_processes=4`` keeps every input value."""
     clusterer = HierarchicalClustering(
         self.__data, lambda a, b: abs(a - b), num_processes=4)
     # Flatten the clusters back into one list of values.
     flattened = []
     for cluster in clusterer.getlevel(40):
         flattened.extend(cluster)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Exemplo n.º 7
0
def buildHcluster(data, threshold):
    """Build a complete-linkage hierarchical clustering and record it.

    Clusters ``data`` with ``distance_function``, names the clusters via
    ``nameCluster`` (using the features returned by ``readFeature('all')``),
    hands the result to ``writeCluster`` and writes a timing log to
    ``<path>/log/H_<threshold>.log``.

    Args:
        data: list of records, e.g. ``[[12, 12], [34, 34], [23, 23]]``.
            Per the original note, the first value of each record is a key
            and is not counted for clustering.
        threshold: distance at which the cluster hierarchy is cut.

    Returns:
        None. Per the original note, the cluster record produced by
        ``writeCluster`` ends up in ``/searchc/save/H.cluster``.
    """
    print("Clustering...")
    started = datetime.datetime.now()
    clusterer = HierarchicalClustering(data, distance_function, 'complete')
    clusterH = clusterer.getlevel(threshold)
    finished = datetime.datetime.now()

    print("Naming...")
    featureAll = readFeature('all')
    naming = nameCluster(clusterH, featureAll)
    name, centroid = naming[0], naming[1]
    writeCluster('H', clusterH, name, centroid, threshold)

    print("Writing...")
    header = "".join([
        "Hierahical Clustering Log\nDate:\t", str(started.date()),
        "\nStart:\t", str(started.time()),
        "\nEnd:\t", str(finished.time()),
        "\nDuration:\t", str(finished - started),
        "\nH:\t", str(threshold),
        "\nMethod:\tComplete",
        "\nNo. cluster:\t", str(len(clusterH)),
        "\n\n",
    ])
    with open(path + '/log/H_' + str(threshold) + '.log', 'w') as outfile:
        outfile.write(header)
        # One line per cluster: its length minus two.
        for members in clusterH:
            outfile.write(str(len(members) - 2) + "\n")

    return
Exemplo n.º 8
0
    def testCompleteLinkage(self):
        """Check the clusters produced by complete linkage at level 40."""
        expected = [
            [24],
            [84],
            [124, 131, 134],
            [336, 365, 365],
            [391, 398],
            [518],
            [542, 564],
            [594],
            [676],
            [791],
            [835],
            [940, 956, 971],
        ]

        def distance(left, right):
            return abs(left - right)

        clusterer = HierarchicalClustering(
            self.__data, distance, linkage='complete')

        # Sorting inside and across clusters keeps the assertion independent
        # of the order in which the algorithm reports them.
        normalised = [sorted(members) for members in clusterer.getlevel(40)]
        normalised.sort()
        self.assertEqual(normalised, expected)
Exemplo n.º 9
0
def clustertitle( request ):
    """Render live articles from the last day, clustered by title n-grams.

    Up to 1000 live articles published within the last day are clustered on
    ``1 - NGram.compare(title_a, title_b)``. Groups with more than one member
    become ``'cluster'`` nodes with a ``'topic'`` from ``common_terms``;
    single articles become ``'article'`` nodes. The nodes are passed to the
    ``clusters.html`` template.
    """
    from cluster import HierarchicalClustering

    def title_distance( first, second ):
        similarity = NGram.compare( first.title, second.title,
                                    warp=WARP, iconv=enrich )
        return 1 - similarity

    one_day_ago = datetime.datetime.now() - datetime.timedelta(1)
    articles = Article.objects.filter(
        status = "live", date_published__gte = one_day_ago
    ).order_by( "date_published" )[:1000]

    # 0.7 was chosen pretty much through trial and error.
    groups = HierarchicalClustering(articles, title_distance).getlevel(0.7)

    clusters = []
    for group in groups:
        if len(group) <= 1:
            clusters.append({
                'type': 'article',
                'article': group[0]
            })
            continue
        clusters.append({
            'type': 'cluster',
            'topic': common_terms( [member.title for member in group] ),
            'articles': group
        })

    context = { "clusters": clusters, }
    return render( request, "clusters.html", dictionary = context )
 def testDataTypes(self):
     """Check that ``getlevel`` returns nothing but lists."""
     list_type = type([])
     for entry in HierarchicalClustering(
             self.__data, self.sim).getlevel(0.5):
         self.assertEqual(
             type(entry), list_type, "Every item should be a list!")
def test(data, expected):
    """Cluster ``data`` at level 5 and assert the result equals ``expected``.

    Prints the sorted input, the clustering result and the expectation
    before asserting, then ``ok`` on success.
    """
    clusterer = HierarchicalClustering(data, lambda a, b: abs(a - b))
    result = clusterer.getlevel(5)
    # Single-argument print calls behave the same on Python 2 and 3.
    print(sorted(data))
    print(result)
    print(expected)
    assert result == expected
    print('ok')
Exemplo n.º 12
0
 def testCluster(self):
     """Cluster the integer fixture explicitly, then cut it at level 40."""
     clusterer = HierarchicalClustering(
         self.__data, lambda a, b: abs(a - b))
     clusterer.cluster()
     expected = [
         [24],
         [84, 124, 131, 134],
         [336, 365, 365, 365, 398, 391],
         [940, 956, 971],
         [791],
         [835],
         [676],
         [518, 564, 542],
     ]
     self.assertEqual(expected, clusterer.getlevel(40))
def cluster(unigrams):
    """Cluster ``unigrams`` with ``score`` and keep only the large clusters.

    The hierarchy is cut at a distance of 0.2; only clusters with more than
    20 members are returned.
    """
    DISTANCE_THRESHOLD = 0.2
    # The clusterer takes the data plus the pairwise scoring function.
    clusterer = HierarchicalClustering(unigrams, score)
    # Cut the hierarchy at the distance threshold.
    groups = clusterer.getlevel(DISTANCE_THRESHOLD)
    # Drop every cluster that has 20 members or fewer.
    return [group for group in groups if len(group) > 20]
Exemplo n.º 14
0
 def testCluster(self):
     """Cluster the string fixture with ``self.sim`` at level 0.5."""
     expected = [
         ['Nullam.'],
         ['Sed'],
         ['mi.'],
         ['ultricies'],
         ['Phasellus'],
         ['amet,', 'at'],
         ['sit', 'elit.', 'elit.', 'elit.'],
         ['leo', 'Lorem', 'dolor'],
         ['neque.', 'congue', 'consectetuer', 'consequat'],
         ['ipsum'],
         ['adipiscing'],
     ]
     clusterer = HierarchicalClustering(self.__data, self.sim)
     self.assertEqual(expected, clusterer.getlevel(0.5))
Exemplo n.º 15
0
    def testSingleLinkage(self):
        """Clustering 2-D tuples with a Euclidean distance returns a result.

        Only checks that ``getlevel`` yields something other than ``None``.
        """

        def euclidian_distance(a, b):
            squared = sum((p - q) ** 2 for p, q in zip(a, b))
            return sqrt(squared)

        self.__data = [(1, 1), (1, 2), (1, 3)]
        clusterer = HierarchicalClustering(self.__data, euclidian_distance)
        self.assertIsNotNone(clusterer.getlevel(40))
Exemplo n.º 16
0
    def testSingleLinkage(self):
        """Smoke test: tuple points plus Euclidean distance give a result."""

        def euclidian_distance(a, b):
            total = 0
            for first, second in zip(a, b):
                total += pow(first - second, 2)
            return sqrt(total)

        points = [(1, 1), (1, 2), (1, 3)]
        self.__data = points
        outcome = HierarchicalClustering(
            self.__data, euclidian_distance).getlevel(40)
        self.assertIsNotNone(outcome)
Exemplo n.º 17
0
    def testIssue28(self):
        """Issue 28: cluster dict keys by the distance of their values."""
        points1D = {
            'p4': 5,
            'p2': 6,
            'p7': 10,
            'p9': 120,
            'p10': 121,
            'p11': 119,
        }

        def distance_func(a, b):
            # The items being clustered are the keys; the distance is taken
            # between the values they map to.
            return abs(points1D[a] - points1D[b])

        clusterer = HierarchicalClustering(list(points1D), distance_func)
        self.assertIsNotNone(clusterer.getlevel(20))
Exemplo n.º 18
0
def ml():
    """Cluster queue sizes, correlate them and fit a toy regression model.

    Reads ``number_of_people`` / ``queue_name`` per queue from the database,
    clusters the head counts at level 5, labels each row of a DataFrame with
    its cluster, adds a random integer column, computes the Pearson
    correlation of the numeric columns and fits a random forest that
    predicts the random column from the head count.

    Returns:
        The rendered ``ML.html`` template with the DataFrame, the
        correlation table and the per-sample absolute test errors.

    Side effects:
        Rebinds the module-level ``cluster_number`` and prints intermediate
        tables to stdout.
    """
    global cluster_number
    rows = db.session.query(ormQueue.number_of_people,
                            ormQueue.queue_name).group_by(
                                ormQueue.queue_name).all()
    queues = [row.queue_name for row in rows]
    n_people = [int(row.number_of_people) for row in rows]

    cl = HierarchicalClustering(n_people, lambda x, y: abs(x - y))
    res = cl.getlevel(5)

    df = pd.DataFrame({'Queues_name': queues, 'Number_of_people': n_people},
                      columns=['Queues_name', 'Number_of_people'])
    print(df)
    for number, members in enumerate(res):
        cluster_number = "Cluster" + str(number + 1)
        print(cluster_number)
        for elem in members:
            print(elem)
            df.loc[df['Number_of_people'] == elem, 'Cluster'] = cluster_number
    print(df)
    df['randNumCol'] = np.random.randint(1, 6, df.shape[0])
    print(df)

    # Restrict the correlation to the numeric columns; the name and cluster
    # columns hold strings and cannot take part in a Pearson correlation.
    pearsoncorr = df[['Number_of_people', 'randNumCol']].corr(method='pearson')
    print(pearsoncorr)

    # Keep X two-dimensional (n_samples, 1): the estimator expects one row
    # per sample. The original wrapped the Series in a list, which turned
    # the whole training set into a single sample.
    X = df[['Number_of_people']]
    Y = df['randNumCol']
    seed = 7
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_size,
                                                        random_state=seed)
    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    # Train on the training split only.
    rf.fit(X_train, y_train)
    # Predict the held-out features (the original predicted on the training
    # labels) and compare with the held-out targets (the original compared
    # with the test features).
    predictions = rf.predict(X_test)
    errors = abs(np.array(predictions) - np.array(y_test))
    # Print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
    return render_template('ML.html',
                           name="Clasterization",
                           name2="Correlation",
                           name3="Regression Model",
                           tables=[df.to_html()],
                           error=errors,
                           table=[pearsoncorr.to_html()],
                           action="/ML")
Exemplo n.º 19
0
 def testCluster(self):
     """Run ``cluster()`` on the integers and compare level 40 verbatim."""
     def distance(left, right):
         return abs(left - right)

     clusterer = HierarchicalClustering(self.__data, distance)
     clusterer.cluster()
     # Compared without sorting: both cluster order and member order count.
     self.assertEqual(
         [[24],
          [84, 124, 131, 134],
          [336, 365, 365, 365, 398, 391],
          [940, 956, 971],
          [791],
          [835],
          [676],
          [518, 564, 542]],
         clusterer.getlevel(40))
Exemplo n.º 20
0
def hierarchical_clustering_by_title(csv_file):
    """Cluster contacts from a UTF-16, tab-separated CSV by job title.

    Every contact's ``'Current Position'`` is split on the module-level
    ``separators``, normalised with the module-level ``transforms`` and
    stored under ``'Job Titles'``. The distinct titles are clustered with
    ``DISTANCE`` over their word sets at ``DISTANCE_THRESHOLD``.

    Args:
        csv_file: path of the CSV export; the first data row is skipped,
            as in the original code.

    Returns:
        dict mapping each non-singleton cluster (a tuple of titles) to the
        list of ``"First Last"`` names of contacts holding one of them.
    """
    # Close the file deterministically instead of leaking the handle.
    with codecs.open(csv_file, "rb", "utf-16") as handle:
        csvReader = csv.DictReader(handle, delimiter='\t', quotechar='"')
        # Builtin next() works on Python 2.6+ and 3; ``.next()`` is
        # Python 2 only. The default avoids StopIteration on an empty file.
        next(csvReader, None)
        contacts = [row for row in csvReader]

    all_titles = []
    for contact in contacts:
        position = contact['Current Position']
        if position == '':
            contact['Job Titles'] = ['']
            continue

        # Split on each separator in turn, building a fresh list every
        # pass. The original removed items from the list it was iterating
        # and reused ``title`` as the comprehension variable, which raises
        # ValueError on Python 3 as soon as two separators match.
        titles = [position]
        for separator in separators:
            pieces = []
            for title in titles:
                if separator in title:
                    pieces.extend(part.strip()
                                  for part in title.split(separator)
                                  if part.strip() != '')
                else:
                    pieces.append(title)
            titles = pieces

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # With fewer than two titles there is nothing to cluster, and for one
    # item getlevel() returns the flat input, so the title string itself
    # would be mistaken for a cluster of characters.
    if len(all_titles) > 1:
        hc = HierarchicalClustering(all_titles, score)
        clusters = hc.getlevel(DISTANCE_THRESHOLD)
    else:
        clusters = []

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        names = []
        clustered_contacts[tuple(cluster)] = names
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    names.append('%s %s' % (contact['First Name'],
                                            contact['Last Name']))

    return clustered_contacts
Exemplo n.º 21
0
def urls_clustering(urls):
    """Cluster ``urls`` by textual similarity and return the clusters.

    The distance between two URLs is ``1 - ratio`` where ``ratio`` comes
    from difflib's ``SequenceMatcher``; the hierarchy is cut at 0.2.
    """
    def distance(url1, url2):
        # Distance between two URLs, based on SequenceMatcher's ratio.
        matcher = SequenceMatcher(None, url1, url2)
        return 1.0 - matcher.ratio()

    # Run the hierarchical clustering.
    return HierarchicalClustering(urls, distance).getlevel(0.2)
Exemplo n.º 22
0
def breakToPeriods(arg, maximaOrder=20, clusteringGranularity = 0.5, file=False):
    """Split a sampled signal into periods delimited by similar local maxima.

    Local maxima are located with ``argrelextrema``, their values are
    clustered by absolute difference, and the largest cluster of maxima is
    used as period boundaries. A stride between two consecutive boundaries
    is kept only when it is between 0.5 and 1.8 times the average length
    ``len(signal) / number_of_boundaries``.

    Args:
        arg: the samples (a list or an object with ``.tolist()``), or a
            path to a text file with one float per line when ``file`` is
            true.
        maximaOrder: ``order`` argument handed to ``argrelextrema``.
        clusteringGranularity: fraction of the signal amplitude used as
            the clustering level (truncated to an int).
        file: treat ``arg`` as a file path.

    Returns:
        A list of periods, each a slice of the sample list; empty when no
        usable boundaries are found.
    """
    if file:
        # The original later rebound the name ``open`` as a local variable,
        # which made this call raise UnboundLocalError. The handle is also
        # closed now.
        with open(arg, 'r') as handle:
            inputAsList = [float(line) for line in handle]
    else:
        inputAsList = arg
    if not isinstance(inputAsList, list):
        inputAsList = inputAsList.tolist()

    a = np.array(inputAsList)
    localMax = argrelextrema(a, np.greater, 0, maximaOrder)[0].tolist()
    try:
        amplitude = np.max(a) - np.min(a)
    except (ValueError, TypeError):
        # An empty or non-numeric signal has no amplitude.
        return []

    maxima = a.take(localMax).tolist()
    cl = HierarchicalClustering(maxima, lambda x, y: abs(x - y))
    clusters = cl.getlevel(int(amplitude * clusteringGranularity))
    if len(clusters) == 0:
        return []

    if len(clusters) == len(localMax):
        # Every maximum ended up in its own cluster: use all maxima. The
        # original kept the list of one-element clusters here, so the
        # ``index`` lookup below was handed lists instead of values.
        longestSeq = maxima
    else:
        # First cluster with the greatest number of members.
        longestSeq = max(clusters, key=len)

    if len(longestSeq) < 2:
        return []

    averageLength = len(inputAsList) / len(longestSeq)
    # ``index`` returns the first occurrence, so equal sample values map to
    # the same position.
    indices = sorted(inputAsList.index(x) for x in longestSeq)

    periods = []
    start = indices[0]
    for end in indices[1:]:
        strideLen = end - start
        if 0.5 * averageLength < strideLen < 1.8 * averageLength:
            periods.append(inputAsList[start:end])
        start = end
    return periods
Exemplo n.º 23
0
 def testCluster(self):
     """Cluster the word fixture with ``self.sim`` and compare level 0.5."""
     expected = [
         ['ultricies'],
         ['Sed'],
         ['Phasellus'],
         ['mi'],
         ['Nullam'],
         ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
         ['leo', 'Lorem', 'dolor'],
         ['congue', 'neque', 'consectetuer', 'consequat'],
         ['adipiscing'],
         ['ipsum'],
     ]
     clusterer = HierarchicalClustering(self.__data, self.sim)
     self.assertEqual(expected, clusterer.getlevel(0.5))
Exemplo n.º 24
0
def getCorners(intersections):
    """Derive four corner points from clustered intersection points.

    Intersections are clustered at level 25 using ``length([p1, p2])`` as
    distance. The four largest clusters are averaged with
    ``averageCoords`` and returned ordered as: the two points with the
    smallest first coordinate (sorted by second coordinate), then the
    remaining points (sorted by second coordinate).
    """
    def point_distance(p1, p2):
        return length([p1, p2])

    clusters = HierarchicalClustering(intersections, point_distance).getlevel(25)

    # There is no check that these really are the corners; the four
    # biggest clusters are simply assumed to be.
    biggest = sorted(clusters, key=len, reverse=True)[:4]

    by_first = sorted(map(averageCoords, biggest), key=lambda p: p[0])
    second_coordinate = lambda p: p[1]
    left = sorted(by_first[:2], key=second_coordinate)
    right = sorted(by_first[2:], key=second_coordinate)
    return left[0], left[1], right[0], right[1]
Exemplo n.º 25
0
def main():
    """Cluster the PhamCluster score matrix at a cutoff of 1 and print it."""
    pC = PhamCluster()
    pC.initialize_matrix()

    clusterer = HierarchicalClustering(
        pC.scoreMatrix, lambda a, b: pC.get_distance(a, b))
    # The cutoff is fixed here rather than asked from the user.
    cutoff = 1
    print('using cutoff of 1')
    clusters = clusterer.getlevel(float(cutoff))

    # Formatting into one string keeps the output identical on Python 2
    # and 3 ("there are N clusters").
    summary = 'there are %d clusters' % len(clusters)
    print(summary)
    print(clusters)
    print(summary)
Exemplo n.º 26
0
 def testCluster(self):
     """String clustering check; currently skipped as non-deterministic."""
     # skipTest() aborts the test here, so the assertion below never runs.
     self.skipTest('These values lead to non-deterministic results. '
                   'This makes it untestable!')
     expected = [
         ['ultricies'],
         ['Sed'],
         ['Phasellus'],
         ['mi'],
         ['Nullam'],
         ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
         ['leo', 'Lorem', 'dolor'],
         ['congue', 'neque', 'consectetuer', 'consequat'],
         ['adipiscing'],
         ['ipsum'],
     ]
     clusterer = HierarchicalClustering(self.__data, self.sim)
     self.assertEqual(expected, clusterer.getlevel(0.5))
Exemplo n.º 27
0
    def testSingleLinkage(self):
        """Default-linkage clustering of the integer fixture at level 40."""
        clusterer = HierarchicalClustering(
            self.__data, lambda a, b: abs(a - b))

        # Sort inside each cluster; the order of the clusters themselves is
        # left to assertCItemsEqual.
        actual = [sorted(group) for group in clusterer.getlevel(40)]

        expected = [
            [24],
            [336, 365, 365, 391, 398],
            [518, 542, 564, 594],
            [676],
            [791],
            [835],
            [84, 124, 131, 134],
            [940, 956, 971],
        ]
        self.assertCItemsEqual(expected, actual)
Exemplo n.º 28
0
 def testCluster(self):
     """Skipped: the string fixture clusters non-deterministically."""
     reason = ('These values lead to non-deterministic results. '
               'This makes it untestable!')
     self.skipTest(reason)
     # Not reached while the skip above is in place.
     clusterer = HierarchicalClustering(self.__data, self.sim)
     self.assertEqual(
         [['ultricies'],
          ['Sed'],
          ['Phasellus'],
          ['mi'],
          ['Nullam'],
          ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
          ['leo', 'Lorem', 'dolor'],
          ['congue', 'neque', 'consectetuer', 'consequat'],
          ['adipiscing'],
          ['ipsum']],
         clusterer.getlevel(0.5))
Exemplo n.º 29
0
    def testSingleLinkage(self):
        """Cluster the integers with the default linkage and cut at 40."""
        def distance(left, right):
            return abs(left - right)

        levels = HierarchicalClustering(self.__data, distance).getlevel(40)

        # Member order inside a cluster is normalised; cluster order is
        # ignored by the assertion helper.
        normalised = []
        for members in levels:
            normalised.append(sorted(members))

        self.assertCItemsEqual([
            [24],
            [336, 365, 365, 391, 398],
            [518, 542, 564, 594],
            [676],
            [791],
            [835],
            [84, 124, 131, 134],
            [940, 956, 971],
        ], normalised)
Exemplo n.º 30
0
def get_music_bars(filename):
    """Return ``[first_row, last_row]`` bands of densely inked image rows.

    For every pixel row the number of pixels equal to 0 is counted. Rows
    whose count is among the 400 largest counts are clustered by row index
    at level 15; clusters with more than two rows become bands, and bands
    spanning more than 20 rows are returned.
    """
    musicpage = Image.open(filename)
    pixels = musicpage.load()
    width, height = musicpage.size

    # Number of zero-valued pixels in each row.
    row_counts = []
    for y in range(height):
        row_counts.append(sum(1 for x in range(width) if pixels[x, y] == 0))

    # The 400 highest counts (the original called this "tophundred").
    top_counts = sorted(row_counts, reverse=True)[0:400]
    candidate_rows = [row for row, count in enumerate(row_counts)
                      if count in top_counts]

    clusterer = HierarchicalClustering(candidate_rows, lambda a, b: abs(a - b))
    bars = []
    for rows in clusterer.getlevel(15):
        if len(rows) <= 2:
            continue
        band = [min(rows), max(rows)]
        if band[1] - band[0] > 20:
            bars.append(band)

    return bars
Exemplo n.º 31
0
 def testUCLUS(self):
     """UCLUS-linkage clustering of the integer fixture at level 40."""
     clusterer = HierarchicalClustering(
         self.__data, lambda a, b: abs(a - b), linkage='uclus')
     # Sorted inside and across clusters so emission order is irrelevant.
     actual = sorted(sorted(group) for group in clusterer.getlevel(40))
     self.assertEqual(actual, [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ])
Exemplo n.º 32
0
 def testUCLUS(self):
     """Check the level-40 clusters produced with ``linkage='uclus'``."""
     def distance(left, right):
         return abs(left - right)

     clusterer = HierarchicalClustering(
         self.__data, distance, linkage='uclus')
     expected = [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ]
     normalised = [sorted(members) for members in clusterer.getlevel(40)]
     normalised.sort()
     self.assertEqual(normalised, expected)
Exemplo n.º 33
0
 def testAverageLinkage(self):
     """Average-linkage clustering of the integer fixture at level 40."""
     clusterer = HierarchicalClustering(
         self.__data, lambda a, b: abs(a - b), linkage='average')
     # TODO: The current test-data does not really trigger a difference
     # between UCLUS and "average" linkage.
     actual = sorted(sorted(group) for group in clusterer.getlevel(40))
     self.assertEqual(actual, [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ])
Exemplo n.º 34
0
 def testAverageLinkage(self):
     """Check the level-40 clusters produced with ``linkage='average'``."""
     def distance(left, right):
         return abs(left - right)

     # TODO: The current test-data does not really trigger a difference
     # between UCLUS and "average" linkage.
     expected = [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ]
     clusterer = HierarchicalClustering(
         self.__data, distance, linkage='average')
     normalised = [sorted(members) for members in clusterer.getlevel(40)]
     normalised.sort()
     self.assertEqual(normalised, expected)
Exemplo n.º 35
0
def hac(topic):
    """Cluster the phrases of ``topic`` that carry an ``'es_phrase'`` value.

    Uses ``HierarchicalClustering`` from the ``cluster`` package
    (https://pypi.python.org/pypi/cluster/1.1.0b1) with ``score`` as the
    distance and ``DISTANCE_THRESHOLD`` as the cut level.

    Returns:
        A list of clusters, each a list of phrase dicts.
    """
    phrases = []
    for phrase in topic:
        if phrase.get('es_phrase'):
            phrases.append(phrase)

    clusters = HierarchicalClustering(phrases, score).getlevel(
        DISTANCE_THRESHOLD)

    # The clustering API sometimes returns a flat list of dicts rather
    # than a list of lists, which breaks callers that loop over the
    # phrases of each cluster; wrap that case into a single cluster.
    returned_flat = len(clusters) == 1 and isinstance(clusters[0], dict)
    if not returned_flat:
        return clusters
    return [clusters]
Exemplo n.º 36
0
    def set_new_level(self, level):
        """Re-cluster ``self._data`` at ``level`` and rebuild the tree view.

        Creates one string column per cluster ("Group 0", "Group 1", ...),
        re-initialises the underlying ``gtk.TreeView`` with a fresh
        ``gtk.ListStore`` and fills it with the rows from ``self._parse``.
        """
        clusterer = HierarchicalClustering(self._data,
                                           self._relative_levenshtein)
        clusteredData = clusterer.getlevel(level)

        self._parsed_clusteredData = self._parse(clusteredData)
        self._column_names = [
            'Group %d' % position
            for position, _ in enumerate(clusteredData)
        ]

        # One ``str`` column type per cluster column.
        column_types = [str] * len(self._column_names)
        self.liststore = gtk.ListStore(*column_types)

        gtk.TreeView.__init__(self, self.liststore)

        # Draw both horizontal and vertical grid lines.
        self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

        # Drop any columns left over from a previous level.
        for stale_column in self.get_columns():
            self.remove_column(stale_column)

        # Reset the selection bookkeeping.
        self.current_path = None
        self.current_column = None

        self._colDict = {}
        for position, title in enumerate(self._column_names):
            column = gtk.TreeViewColumn(title)
            self.append_column(column)
            renderer = gtk.CellRendererText()
            column.pack_start(renderer, True)
            column.set_attributes(renderer, text=position)
            # Remember which index each column object has; the original
            # comment points at a FIXME elsewhere in the file.
            self._colDict[column] = position

        for row in self._parsed_clusteredData:
            self.liststore.append(row)
Exemplo n.º 37
0
    def set_new_level(self, level):
        """Cluster the data at ``level`` and repopulate the widget.

        The clusters become text columns named "Group <n>"; the list store,
        the tree view columns and the column-to-index map are all rebuilt
        from scratch.
        """
        # Cluster and parse.
        clusteredData = HierarchicalClustering(
            self._data, self._relative_levenshtein).getlevel(level)
        self._parsed_clusteredData = self._parse(clusteredData)

        names = []
        for number in range(len(clusteredData)):
            names.append('Group %d' % number)
        self._column_names = names

        # Build the list store with one string column per name and attach
        # it to the tree view.
        dynamicListStoreTypes = [str for _ in self._column_names]
        self.liststore = gtk.ListStore(*dynamicListStoreTypes)
        gtk.TreeView.__init__(self, self.liststore)
        self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

        # Start from an empty set of columns.
        for existing in self.get_columns():
            self.remove_column(existing)

        self.current_path = None
        self.current_column = None

        self._colDict = {}
        for index, heading in enumerate(self._column_names):
            colObject = gtk.TreeViewColumn(heading)
            self.append_column(colObject)
            cell = gtk.CellRendererText()
            colObject.pack_start(cell, True)
            colObject.set_attributes(cell, text=index)
            # Stored for later lookup of a column's index (the original
            # comment refers to a FIXME elsewhere in the file).
            self._colDict[colObject] = index

        for parsed_row in self._parsed_clusteredData:
            self.liststore.append(parsed_row)
Exemplo n.º 38
0
 def testClusterLen0(self):
     """Clustering an empty list must give back an empty list."""
     clusterer = HierarchicalClustering([], lambda a, b: abs(a - b))
     outcome = clusterer.getlevel(40)
     self.assertEqual([], outcome)
Exemplo n.º 39
0
 def testUnmodifiedData(self):
     """The clusters at level 0.5 contain exactly the input items."""
     clusterer = HierarchicalClustering(self.__data, self.sim)
     flattened = []
     for cluster in clusterer.getlevel(0.5):
         flattened.extend(cluster)
     self.assertEqual(sorted(flattened), sorted(self.__data))
def cluster_contacts_by_title(csv_file):
    """Cluster contacts from a comma-separated CSV by normalised job title.

    Every contact's ``'Job Title'`` is split on ``'/'``, ``'and'`` and
    ``'&'``, known abbreviations are expanded, and the result is stored
    under ``'Job Titles'``. The distinct titles are clustered with
    ``DISTANCE`` over their word sets at ``DISTANCE_THRESHOLD``.

    Args:
        csv_file: path of the CSV file; it needs the columns
            ``'Job Title'``, ``'First Name'`` and ``'Last Name'``.

    Returns:
        dict mapping each non-singleton cluster (a tuple of titles) to the
        list of ``"First Last"`` names of contacts holding one of them.
    """
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
        ]

    # NOTE(review): these are plain substring separators, so 'and' also
    # splits inside words that merely contain it - confirm that is wanted.
    separators = ['/', 'and', '&']

    # Close the file deterministically instead of leaking the handle.
    with open(csv_file) as handle:
        csvReader = csv.DictReader(handle, delimiter=',', quotechar='"')
        contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up list of common titles
    all_titles = []
    for contact in contacts:
        job_title = contact['Job Title']
        if job_title == '':
            contact['Job Titles'] = ['']
            continue

        # Split on each separator in turn, building a fresh list every
        # pass. The original removed items from the list it was iterating
        # and reused ``title`` as the comprehension variable, which raises
        # ValueError on Python 3 as soon as two separators match.
        titles = [job_title]
        for separator in separators:
            pieces = []
            for title in titles:
                if separator in title:
                    pieces.extend(part.strip()
                                  for part in title.split(separator)
                                  if part.strip() != '')
                else:
                    pieces.append(title)
            titles = pieces

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Same output as the Python 2 statement ``print "Scoring....", "\n"``.
    print("Scoring.... \n")

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # With fewer than two titles there is nothing to cluster, and for one
    # item getlevel() returns the flat input, so the title string itself
    # would be mistaken for a cluster of characters.
    if len(all_titles) > 1:
        hc = HierarchicalClustering(all_titles, score)
        clusters = hc.getlevel(DISTANCE_THRESHOLD)
    else:
        clusters = []

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    print("Clustering contacts by title.... \n")
    clustered_contacts = {}
    for cluster in clusters:
        names = []
        clustered_contacts[tuple(cluster)] = names
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    names.append('%s %s' % (contact['First Name'],
                                            contact['Last Name']))

    return clustered_contacts
Exemplo n.º 41
0
 def testUnmodifiedData(self):
     """Flattening the level-0.5 clusters gives back the original items."""
     levels = HierarchicalClustering(self.__data, self.sim).getlevel(0.5)
     collected = [item for cluster in levels for item in cluster]
     self.assertEqual(sorted(collected), sorted(self.__data))
Exemplo n.º 42
0
def test2():
        """Print every cluster of ``data`` at level 40, then ``data`` itself."""
        clusterer = HierarchicalClustering(data, lambda a, b: abs(a - b))
        clusters = clusterer.getlevel(40)
        for cluster in clusters:
            print(cluster)
        print(data)
    def _align_features_cluster(self, m, rt_diff_cutoff, fdr_cutoff,
                                aligned_fdr_cutoff, method):
        """Align features by clustering all peakgroups.

        This algorithm will find the best peakgroup cluster over all runs
        and then select all peakgroups belonging to the cluster. It does not
        treat heavy/light specially (they are treated like two independent
        runs).

        Every peakgroup with an FDR score below ``aligned_fdr_cutoff`` is
        clustered on its normalized retention time at ``rt_diff_cutoff``.
        The clusters are ranked by length-normalized score and each
        peakgroup receives the 1-based rank of its cluster via
        ``setClusterID``.

        Args:
            m: object exposing ``getAllPeptides()``.
            rt_diff_cutoff: clustering level on the retention-time distance.
            fdr_cutoff: not used by this method.
            aligned_fdr_cutoff: upper bound (exclusive) on the FDR score of
                peakgroups taken into the clustering.
            method: not used by this method.

        Returns:
            None; the result is stored on the peakgroups.
        """
        verb = self.verbose

        def normalized_score(cluster):
            # Length normalization divides the score by the expected
            # probability value if all peakgroups were chosen randomly:
            # assuming equal probability between 0 and aligned_fdr_cutoff,
            # the expected value for one random peakgroup is
            # "aligned_fdr_cutoff/2", hence (aligned_fdr_cutoff/2)^n for n.
            return cluster.getTotalScore() / (
                (aligned_fdr_cutoff / 2) ** len(cluster.peakgroups))

        # The print calls below use %-formatting so the output is the same
        # on Python 2 and Python 3.
        if verb:
            print("%s %s" % (
                "00000000000000000000000000000000000 new peptide (cluster)",
                m.getAllPeptides()[0].get_id()))

        # i) get all RTs above the cutoff
        for p in m.getAllPeptides():  # loop over all peptides
            pg = p.get_best_peakgroup()
            if verb:
                print("best rt %s %s %s" % (
                    pg.get_normalized_retentiontime(),
                    pg.peptide.run.get_id(), pg.get_fdr_score()))

        groups = [
            pg for p in m.getAllPeptides()  # loop over all peptides
            for pg in p.get_all_peakgroups()  # loop over all peakgroups
            if pg.get_fdr_score() < aligned_fdr_cutoff
        ]

        # Check for empty groups
        if len(groups) == 0:
            return

        if len(groups) == 1:
            # A single group needs a special cluster object of size one;
            # the clustering result is not a list of clusters in that case.
            clusters_rt_obj = [Cluster(groups)]
        else:
            # do the clustering
            from cluster import HierarchicalClustering
            cl = HierarchicalClustering(
                groups,
                lambda x, y: abs(x.get_normalized_retentiontime() -
                                 y.get_normalized_retentiontime()))
            # for large clusters, this is the bottleneck!
            clusters_rt = cl.getlevel(rt_diff_cutoff)
            clusters_rt_obj = [Cluster(c) for c in clusters_rt]

        if verb:
            print("==== Clusters ")
        # make sure only one is selected from each run...
        for c in clusters_rt_obj:
            c.select_one_per_run(self.verbose)
            if verb:
                print(" - Cluster with score %s at %s +/- %s (norm_score %s)" % (
                    c.getTotalScore(), c.getMedianRT(), c.getRTstd(),
                    float(c.getTotalScore()) /
                    ((aligned_fdr_cutoff / 2) ** len(c.peakgroups))))
                for pg in c.peakgroups:
                    print("   = Have member %s" % (pg.print_out(),))

        # Get best cluster by length-normalized best score. The original key
        # normalized every candidate by the length of the *last* cluster of
        # the loop above (stale variable ``c``) instead of its own length.
        # NOTE(review): ``bestcluster`` is not read again in this method.
        bestcluster = min(clusters_rt_obj, key=normalized_score)

        # Ascending by normalized score; a stable key sort yields the same
        # order as the former cmp-based sort.
        clusters_rt_obj.sort(key=normalized_score)
        for i, c in enumerate(clusters_rt_obj):
            for pg in c.peakgroups:
                pg.setClusterID(i + 1)
Exemplo n.º 44
0
 def testUnmodifiedData(self):
     """The level-40 clusters of the integers hold exactly the input."""
     clusterer = HierarchicalClustering(
         self.__data, lambda a, b: abs(a - b))
     flattened = []
     for cluster in clusterer.getlevel(40):
         flattened.extend(cluster)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Exemplo n.º 45
0
 def testClusterLen0(self):
     """An empty input list produces an empty ``getlevel`` result."""
     nothing = []
     clusterer = HierarchicalClustering(nothing, lambda a, b: abs(a - b))
     self.assertEqual([], clusterer.getlevel(40))
Exemplo n.º 46
0
#pt.printt()

#sort_list = fdist.keys()
#print sort_list


print "Clustering Musics"      

# Define a scoring function
def score(music1, music2):
  """Return DISTANCE between the two items, each converted to a set first."""
  first_set = set(music1)
  second_set = set(music2)
  return DISTANCE(first_set, second_set)

# Build the clusterer from the data and the scoring function
hc = HierarchicalClustering(musics, score)

# Cut the hierarchy at the distance threshold and keep only the clusters
# that have more than one member (singletons are dropped)
clusters = [c for c in hc.getlevel(DISTANCE_THRESHOLD) if len(c) > 1]


######## End: HAC ########

# Round up musics who are in these clusters and group them together

clustered_musics = {}

for cluster in clusters:
  clustered_musics[tuple(cluster)] = []
  for idx, music in enumerate(musics):
    for tag in music:
Exemplo n.º 47
0
######## Begin: HAC ########

# Define a scoring function


def score(title1, title2):
    """Return DISTANCE between the whitespace-separated word sets of two titles."""
    first_words = set(title1.split())
    second_words = set(title2.split())
    return DISTANCE(first_words, second_words)


# Build the clusterer from every title and the pairwise scoring function
# defined above.

hc = HierarchicalClustering(all_titles, score)

# Cut the hierarchy at DISTANCE_THRESHOLD to obtain the clusters.

clusters = hc.getlevel(DISTANCE_THRESHOLD)

# The singleton filter below is commented out, so clusters with a single
# member remain in `clusters`.
# clusters = [c for c in clusters if len(c) > 1]

######## End: HAC ########

# Round up contacts who are in these clusters and group them together

clustered_contacts = {}
for cluster in clusters:
    clustered_contacts[tuple(cluster)] = []
    for contact in contacts:
        for title in contact['Job Titles']:
            if title in cluster:
                clustered_contacts[tuple(cluster)].append(
Exemplo n.º 48
0
# Sample integers clustered by the demo code below.
data = [24, 84, 124, 131, 134, 336, 365, 365, 391, 398, 518, 542, 564, 594, 676,
        791, 835, 940, 956, 971]
# The same values as `data`, listed in a different order.
data2 = [791, 956, 676, 124, 564, 84, 24, 365, 594, 940, 398, 971, 131, 365, 542,
         336, 518, 835, 134, 391]

def test2():
    """Print each cluster of ``data`` found at level 40, then ``data`` itself.

    The module-level ``data`` list is clustered with the absolute difference
    as the distance function.  Every cluster returned by ``getlevel(40)`` is
    printed on its own line, followed by the input list.
    """
    cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
    for row in cl.getlevel(40):
        print(row)
    print(data)

def run(level):
    """Print the requested level, then one line per cluster of ``data``.

    ``data`` is the module-level list; the absolute difference between two
    values is used as the distance.
    """
    print('Level = {}'.format(level))
    clusterer = HierarchicalClustering(data, lambda a, b: abs(a - b))
    for group in clusterer.getlevel(level):
        print(group)

# Show the input, then print its clusters at level 40.
print(data)
run(40)
#print(len(data))

#test2()

# Cluster once more; the clusters returned by getlevel() are discarded here.
cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
cl.getlevel(40)
# Prints whether `data` and `data2` hold the same values regardless of order.
print(sorted(data) == sorted(data2))
Exemplo n.º 49
0
 def testMultiprocessing(self):
     """With num_processes=4, the clusters at level 40 must hold the input."""
     clusterer = HierarchicalClustering(self.__data,
                                        lambda a, b: abs(a - b),
                                        num_processes=4)
     flattened = []
     for group in clusterer.getlevel(40):
         flattened.extend(group)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Exemplo n.º 50
0
# Transpose n_sequence with zip(*...), join every resulting tuple into one
# string, and order the strings from longest to shortest.
n_sequence = zip(*n_sequence)
n_sequence = [''.join(i) for i in n_sequence]
n_sequence.sort(key=len, reverse=True)

# Walk the strings while tracking the length of the previous one (p_size).
p_size = 0
new_seq = []
chunk = []
for seq in n_sequence:
    size = len(seq)
    if size == p_size:
        # Same length as the previous string: collect it in new_seq.
        new_seq.append(seq)
    else:
        # Length changed: remember the new length.  The string that
        # triggers the change is not stored in new_seq or chunk.
        p_size = size
        # NOTE(review): nothing in this snippet ever adds items to `chunk`,
        # so this branch never runs as written -- presumably strings were
        # meant to be gathered in `chunk` first; confirm the intent.
        if chunk:
            cl = HierarchicalClustering(chunk, lambda x, y: distance(x, y))
            # NOTE(review): the clusters returned by getlevel(1) are
            # discarded and the clustering object itself is added to
            # new_seq -- presumably the returned clusters were intended.
            cl.getlevel(1)
            new_seq += cl
            print(new_seq)
            chunk = []

print(len(n_sequence))
print(len(identifiers))
# Write the identifiers on one line, then the strings of n_sequence one per
# line (new_seq is not written), encoded as UTF-8.
out.write(bytes(''.join(identifiers), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.write(bytes('\n'.join(n_sequence), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.close()

# Sizes in bytes of the input and output files.
in_size = os.path.getsize(in_file)
out_size = os.path.getsize(out_file)
Exemplo n.º 51
0
 def testUnmodifiedData(self):
     """The items spread over the level-40 clusters must equal the input."""
     clustering = HierarchicalClustering(self.__data,
                                         lambda left, right: abs(left - right))
     collected = []
     for members in clustering.getlevel(40):
         collected.extend(members)
     self.assertEqual(sorted(collected), sorted(self.__data))
Exemplo n.º 52
0
def cluster_contacts_by_title(csv_file):
    """Group contacts from a CSV file under clusters of similar job titles.

    A built-in list of sample job titles is clustered with
    ``HierarchicalClustering`` (metric ``DISTANCE``, cut-off
    ``DISTANCE_THRESHOLD``).  The contacts in ``csv_file`` are then read,
    their job titles are split on separators and have known abbreviations
    expanded, and every contact holding a title that belongs to a cluster
    with more than one member is listed under that cluster.

    The full cluster list and the number of sample titles are printed as a
    side effect.

    Args:
        csv_file: Path of a comma-delimited CSV file whose header row
            provides the ``Job Title``, ``First Name`` and ``Last Name``
            columns.

    Returns:
        A dict mapping each cluster with more than one member (as a tuple)
        to a list of ``'<first name> <last name> '`` strings, one entry for
        every title of a contact that is found in the cluster.

    Raises:
        KeyError: If a row lacks one of the columns named above.
    """
    def score(title1, title2):
        # Distance between two raw title strings.
        return DISTANCE(title1, title2)

    all_titles = [
        "Student",
        "Assistant Professor",
        "Student Ambassador",
        "Assistant Developer",
        "Human Resources",
        "Software Developer",
        "Head, Technical Affairs-Software",
        "Sofware Engineer",
        "Software Engineer",
        "Design Secretary",
        "Telesales Executive",
        "Filmmaker",
        "Writer",
        "Data Developer",
        "Software Developmer",
        "Co-founder",
        "Assistant Manager",
        "Management Trainee - Operations",
        "Oracle Database Administrator",
        "Key Account Manager",
        "Engineering Manager",
        "Talent Acquisition Manager",
        "Wireless Protocol Test Intern",
        "HR Executive",
        "IT Company",
        "Business Development Manager",
        "Member of Technical Staff",
        "Web Designer",
        "ECE Student",
        "Intern",
        "Head of Growth",
        "SA",
        "Manager (Technology)",
        "Systems Engineer",
        "Technical Team Member",
        "Business Developer",
        "system engineer",
        "Infrastructure Developer",
        "Engineer",
        "Mechanical Engineer",
        "Student Technical Assistant",
        "Senior Software Engineer",
        "Senior Software Developer",
        "Associate Professor",
        "Professor",
        "Software developer",
        "Director - Software Engineering",
        "Product Manager",
    ]

    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(clusters)
    print(len(all_titles))

    # Only clusters with more than one member are reported.
    clusters = [c for c in clusters if len(c) > 1]

    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]

    # NOTE(review): separators are matched as plain substrings, so 'and'
    # also splits a title such as 'Brand Manager' -- confirm this is wanted.
    separators = ['/', 'and', '&']

    def split_title(raw_title):
        # Split on each separator in turn.  Pieces are stripped and empty
        # pieces dropped; a title containing no separator is kept verbatim.
        # Building a new list per pass avoids mutating a list while
        # iterating over it.
        parts = [raw_title]
        for separator in separators:
            pieces = []
            for part in parts:
                if separator in part:
                    pieces.extend(piece.strip()
                                  for piece in part.split(separator)
                                  if piece.strip())
                else:
                    pieces.append(part)
            parts = pieces
        return parts

    # The context manager closes the file as soon as the rows are loaded.
    with open(csv_file) as handle:
        contacts = list(csv.DictReader(handle, delimiter=',', quotechar='"'))

    # Normalize every contact's job title into a list of titles.
    for contact in contacts:
        if contact['Job Title'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_title(contact['Job Title'])
        for old, new in transforms:
            titles = [title.replace(old, new) for title in titles]
        contact['Job Titles'] = titles

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        names = []
        clustered_contacts[tuple(cluster)] = names
        # The cluster is already used as a dict key, so its members are
        # hashable and a set gives constant-time membership tests.
        members = set(cluster)
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in members:
                    names.append(
                        '%s %s ' %
                        (contact['First Name'], contact['Last Name']))

    return clustered_contacts
Exemplo n.º 53
0
def cluster_contacts_by_title(csv_file):
    """Cluster the job titles found in a contacts CSV and group the contacts.

    Each contact's ``Job Title`` is split on separators and has known
    abbreviations expanded; the resulting list is stored on the row under
    ``Job Titles``.  The distinct titles are clustered with
    ``HierarchicalClustering``, using ``DISTANCE`` over the sets of
    whitespace-separated words and ``DISTANCE_THRESHOLD`` as the cut-off.
    Every contact holding a title that belongs to a cluster with more than
    one member is then listed under that cluster.

    Two progress messages are printed as a side effect.

    Args:
        csv_file: Path of a comma-delimited CSV file whose header row
            provides the ``Job Title``, ``First Name`` and ``Last Name``
            columns.

    Returns:
        A dict mapping each cluster with more than one member (as a tuple)
        to a list of ``'<first name> <last name>'`` strings, one entry for
        every title of a contact that is found in the cluster.

    Raises:
        KeyError: If a row lacks one of the columns named above.
    """
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]

    # NOTE(review): separators are matched as plain substrings, so 'and'
    # also splits a title such as 'Brand Manager' -- confirm this is wanted.
    separators = ['/', 'and', '&']

    def split_title(raw_title):
        # Split on each separator in turn.  Pieces are stripped and empty
        # pieces dropped; a title containing no separator is kept verbatim.
        # Building a new list per pass avoids mutating a list while
        # iterating over it.
        parts = [raw_title]
        for separator in separators:
            pieces = []
            for part in parts:
                if separator in part:
                    pieces.extend(piece.strip()
                                  for piece in part.split(separator)
                                  if piece.strip())
                else:
                    pieces.append(part)
            parts = pieces
        return parts

    # The context manager closes the file as soon as the rows are loaded.
    with open(csv_file) as handle:
        contacts = list(csv.DictReader(handle, delimiter=',', quotechar='"'))

    # Normalize and/or replace known abbreviations
    # and build up list of common titles
    all_titles = []
    for contact in contacts:
        if contact['Job Title'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_title(contact['Job Title'])
        for old, new in transforms:
            titles = [title.replace(old, new) for title in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    # Keep each distinct title once.
    all_titles = list(set(all_titles))

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Scoring.... \n")

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    print("Clustering contacts by title.... \n")
    clustered_contacts = {}
    for cluster in clusters:
        names = []
        clustered_contacts[tuple(cluster)] = names
        # The cluster is already used as a dict key, so its members are
        # hashable and a set gives constant-time membership tests.
        members = set(cluster)
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in members:
                    names.append(
                        '%s %s' %
                        (contact['First Name'], contact['Last Name']))

    return clustered_contacts
Exemplo n.º 54
0
def run(level):
    """Print the requested level, then one line per cluster of ``data``.

    ``data`` is the module-level list; the absolute difference between two
    values is used as the distance.
    """
    print('Level = {}'.format(level))
    clusterer = HierarchicalClustering(data, lambda a, b: abs(a - b))
    for group in clusterer.getlevel(level):
        print(group)