Пример #1
0
    def testCompleteLinkage(self):
        """Cluster the integer fixture with complete linkage at level 40."""
        clustering = HierarchicalClustering(
            self.__data, lambda x, y: abs(x - y), linkage='complete')

        # Normalise the ordering (inside every cluster and between clusters)
        # so the comparison does not depend on the algorithm's output order.
        result = [sorted(members) for members in clustering.getlevel(40)]
        result.sort()

        self.assertEqual(result, [
            [24],
            [84],
            [124, 131, 134],
            [336, 365, 365],
            [391, 398],
            [518],
            [542, 564],
            [594],
            [676],
            [791],
            [835],
            [940, 956, 971],
        ])
 def testDataTypes(self):
     """Every cluster returned by getlevel(0.5) must be exactly a list."""
     clustering = HierarchicalClustering(self.__data, self.sim)
     list_type = type([])
     for cluster in clustering.getlevel(0.5):
         self.assertEqual(type(cluster), list_type,
                          "Every item should be a list!")
Пример #3
0
def buildHcluster(data, threshold):
	"""
	Description:Build Hierachical Cluster
	Input:
			data: e.g. data = [	[12,12],[34,34],
								[23,23],[32,32],
								[46,46],[96,96],
								[13,13],[1,1],
								[4,4],[9,9]] 
								# The first variable is key, not counted for clustering
			threshold: threshold distance to break cluster
	Output: cluster record file  /searchc/save/H.cluster
	"""
	print "Clustering..."
	a = datetime.datetime.now()
	cl = HierarchicalClustering(data,distance_function,'complete')
	clusterH =  cl.getlevel(threshold)	     			# get h clusters
	b = datetime.datetime.now()
	print "Naming..."
	featureAll = readFeature('all')
	c = nameCluster(clusterH,featureAll)
	name = c[0]
	centroid = c[1]
	writeCluster('H',clusterH,name,centroid,threshold)
	print "Writing..."
	with open(path+'/log/H_'+str(threshold)+'.log','w') as outfile:
		outfile.write("Hierahical Clustering Log\nDate:\t"+str(a.date())+"\nStart:\t"+str(a.time())+"\nEnd:\t"+str(b.time())+"\nDuration:\t"+str(b-a)+"\nH:\t"+str(threshold)+"\nMethod:\tComplete"+"\nNo. cluster:\t"+str(len(clusterH))+"\n\n")
		for cluster in clusterH:
			outfile.write(str(len(cluster)-2)+"\n")
	
	return
Пример #4
0
 def testClusterLen1(self):
     """
     Clustering a one-element input must give back that one-element list.
     """
     clustering = HierarchicalClustering([876], lambda x, y: abs(x - y))
     expected = [876]
     self.assertEqual(expected, clustering.getlevel(40))
Пример #5
0
 def testDataTypes(self):
     """Check that getlevel(0.5) yields nothing but plain lists."""
     message = "Every item should be a list!"
     clusters = HierarchicalClustering(self.__data, self.sim).getlevel(0.5)
     for cluster in clusters:
         self.assertEqual(type(cluster), type([]), message)
Пример #6
0
def getSubbatch(images, image_labels, similar_thred):
    """Group images of similar size into sub-batches sharing one size.

    Images are clustered on the Manhattan distance between their
    (height, width). Inside a cluster with several members, every image and
    its label map are resized with ``cv2.resize`` to the cluster's median
    size, so the members can be stacked into one array.

    Args:
        images: sequence of arrays; ``shape[0]``/``shape[1]`` are read as
            height/width.
        image_labels: label arrays, index-aligned with ``images``.
        similar_thred: clustering level passed to ``getlevel``.

    Returns:
        A list of ``{'images': ndarray, 'labels': ndarray}`` dicts, one per
        cluster, ordered from the smallest cluster to the largest.
    """
    # For a single input item the clustering returns the flat data list
    # rather than a list of clusters, which the loop below cannot handle.
    # Short-circuit that case.
    if len(images) == 1:
        return [{
            'images': np.array([images[0]]),
            'labels': np.array([image_labels[0]]),
        }]

    # Carry the original index along so members can be mapped back to images.
    sizes = [(image.shape[0], image.shape[1], idx)
             for idx, image in enumerate(images)]
    cl = HierarchicalClustering(
        sizes, lambda a, b: abs(a[0] - b[0]) + abs(a[1] - b[1]))
    # The original called sorted() and threw the result away; keep it.
    clusters = sorted(cl.getlevel(similar_thred), key=len)

    subbatches = []
    for cluster in clusters:
        if len(cluster) > 1:
            median = np.median(cluster, axis=0)
            target_h, target_w = int(median[0]), int(median[1])
            subbatch_im = []
            subbatch_label = []
            for height, width, idx in cluster:
                if height != target_h or width != target_w:
                    # cv2.resize takes the target size as (width, height).
                    subbatch_im.append(
                        cv2.resize(images[idx], (target_w, target_h)))
                    subbatch_label.append(
                        cv2.resize(image_labels[idx], (target_w, target_h)))
                else:
                    subbatch_im.append(images[idx])
                    subbatch_label.append(image_labels[idx])
            subbatches.append({
                'images': np.array(subbatch_im),
                'labels': np.array(subbatch_label)
            })
        else:
            idx = cluster[0][2]
            subbatches.append({
                'images': np.array([images[idx]]),
                'labels': np.array([image_labels[idx]])
            })
    return subbatches
Пример #7
0
def clustertitle( request ):
    """Cluster the last day's live articles by n-gram similarity of titles.

    Renders "clusters.html" with a list of nodes: a 'cluster' node for every
    group of several articles and an 'article' node for every lone article.
    """
    from cluster import HierarchicalClustering

    def sim(a, b):
        # NGram.compare yields a similarity; the clustering needs a distance.
        similarity = NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)
        return 1 - similarity

    since = datetime.datetime.now() - datetime.timedelta(1)
    articles = Article.objects.filter(
        status="live", date_published__gte=since
    ).order_by("date_published")[:1000]

    # 0.7 was picked by trial and error.
    groups = HierarchicalClustering(articles, sim).getlevel(0.7)

    clusters = []
    for group in groups:
        if len(group) <= 1:
            node = {'type': 'article', 'article': group[0]}
        else:
            node = {
                'type': 'cluster',
                'topic': common_terms([a.title for a in group]),
                'articles': group,
            }
        clusters.append(node)

    return render(request, "clusters.html", dictionary={"clusters": clusters})
Пример #8
0
 def testCluster(self):
     """Cluster the integer fixture explicitly, then compare level 40."""
     clustering = HierarchicalClustering(self.__data,
                                         lambda x, y: abs(x - y))
     clustering.cluster()
     expected = [
         [24],
         [84, 124, 131, 134],
         [336, 365, 365, 365, 398, 391],
         [940, 956, 971],
         [791],
         [835],
         [676],
         [518, 564, 542],
     ]
     self.assertEqual(expected, clustering.getlevel(40))
def test(data, expected):
    cl = HierarchicalClustering(data, lambda x, y: abs(x-y))
    result = cl.getlevel(5)
    print(sorted(data))
    print result
    print expected
    assert result == expected
    print 'ok'
Пример #10
0
 def testCluster(self):
     """Cluster the string fixture with self.sim and compare level 0.5."""
     clustering = HierarchicalClustering(self.__data, self.sim)
     expected = [
         ['Nullam.'],
         ['Sed'],
         ['mi.'],
         ['ultricies'],
         ['Phasellus'],
         ['amet,', 'at'],
         ['sit', 'elit.', 'elit.', 'elit.'],
         ['leo', 'Lorem', 'dolor'],
         ['neque.', 'congue', 'consectetuer', 'consequat'],
         ['ipsum'],
         ['adipiscing'],
     ]
     self.assertEqual(expected, clustering.getlevel(0.5))
def cluster(unigrams):
    """Cluster ``unigrams`` with ``score`` as the distance and return only
    the clusters that have more than 20 members."""
    distance_threshold = 0.2
    min_members = 20
    clustering = HierarchicalClustering(unigrams, score)
    # Cut the hierarchy at the distance threshold, then drop every cluster
    # that is not larger than ``min_members``.
    return [group
            for group in clustering.getlevel(distance_threshold)
            if len(group) > min_members]
Пример #12
0
    def testSingleLinkage(self):
        """Smoke test: clustering 2-D tuples with a Euclidean distance
        returns something other than None."""

        def euclidian_distance(a, b):
            squared = 0
            for p, q in zip(a, b):
                squared += pow(p - q, 2)
            return sqrt(squared)

        self.__data = [(1, 1), (1, 2), (1, 3)]
        clustering = HierarchicalClustering(self.__data, euclidian_distance)
        self.assertIsNotNone(clustering.getlevel(40))
Пример #13
0
    def testIssue28(self):
        """Issue28: clustering keys whose distance is looked up in a dict
        must return a result."""
        points1D = dict(p4=5, p2=6, p7=10, p9=120, p10=121, p11=119)

        def distance_func(a, b):
            return abs(points1D[a] - points1D[b])

        names = list(points1D.keys())
        clustering = HierarchicalClustering(names, distance_func)
        self.assertIsNotNone(clustering.getlevel(20))
Пример #14
0
def ml():
    """Cluster queue sizes, correlate them with a random column and fit a
    random-forest regression; render everything in 'ML.html'.

    Steps:
      1. Load (number_of_people, queue_name) rows grouped by queue name.
      2. Cluster the head counts at level 5 and label every row with its
         cluster in a 'Cluster' column.
      3. Add a random integer column 'randNumCol' (values 1..5) and compute
         the Pearson correlation of the numeric columns.
      4. Fit a RandomForestRegressor predicting 'randNumCol' from
         'Number_of_people' and report the absolute errors on the test split.

    Returns:
        The rendered 'ML.html' template.
    """
    global cluster_number
    rows = db.session.query(ormQueue.number_of_people,
                            ormQueue.queue_name).group_by(
                                ormQueue.queue_name).all()
    queues = [row.queue_name for row in rows]
    n_people = [int(row.number_of_people) for row in rows]

    cl = HierarchicalClustering(n_people, lambda x, y: abs(x - y))
    res = cl.getlevel(5)
    df = pd.DataFrame({'Queues_name': queues, 'Number_of_people': n_people},
                      columns=['Queues_name', 'Number_of_people'])
    print(df)
    for number, members in enumerate(res, start=1):
        cluster_number = "Cluster" + str(number)
        print(cluster_number)
        # With a single input value the clustering hands back the flat data
        # list, so a "cluster" may be a bare number rather than a list.
        if not isinstance(members, list):
            members = [members]
        for elem in members:
            print(elem)
            df.loc[df['Number_of_people'] == elem, 'Cluster'] = cluster_number
    print(df)
    df['randNumCol'] = np.random.randint(1, 6, df.shape[0])
    print(df)
    # Correlate the numeric columns only; the name/cluster columns hold
    # strings and are rejected by recent pandas versions.
    pearsoncorr = df[['Number_of_people', 'randNumCol']].corr(method='pearson')
    print(pearsoncorr)

    # scikit-learn expects a 2-D (n_samples, n_features) feature matrix,
    # hence the double brackets.
    X = df[['Number_of_people']]
    Y = df['randNumCol']
    seed = 7
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_size,
                                                        random_state=seed)
    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    # Train on the training split (features -> target) ...
    rf.fit(X_train, y_train)
    # ... predict from the held-out features ...
    predictions = rf.predict(X_test)
    # ... and compare the predictions with the held-out targets.
    errors = abs(np.array(predictions) - np.array(y_test))
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
    return render_template('ML.html',
                           name="Clasterization",
                           name2="Correlation",
                           name3="Regression Model",
                           tables=[df.to_html()],
                           error=errors,
                           table=[pearsoncorr.to_html()],
                           action="/ML")
Пример #15
0
 def testCluster(self):
     """Run cluster() on the integer fixture and compare the level-40 cut
     with the expected grouping (order-sensitive)."""
     clustering = HierarchicalClustering(self.__data,
                                         lambda x, y: abs(x - y))
     clustering.cluster()
     expected = [
         [24],
         [84, 124, 131, 134],
         [336, 365, 365, 365, 398, 391],
         [940, 956, 971],
         [791],
         [835],
         [676],
         [518, 564, 542],
     ]
     self.assertEqual(expected, clustering.getlevel(40))
Пример #16
0
def urls_clustering(urls):
    """Cluster ``urls`` hierarchically and return the clusters at level 0.2.

    The distance between two URLs is ``1 - ratio``, where ``ratio`` comes
    from difflib's SequenceMatcher.
    """
    def distance(url1, url2):
        matcher = SequenceMatcher(None, url1, url2)
        return 1.0 - matcher.ratio()

    # Run the hierarchical clustering and cut it at the fixed threshold.
    return HierarchicalClustering(urls, distance).getlevel(0.2)
Пример #17
0
def hierarchical_clustering_by_title(csv_file):
    """Cluster contacts by job title.

    Reads a tab-separated, UTF-16 encoded CSV export, normalises each
    contact's 'Current Position' into a list of titles (split on the
    module-level ``separators``, rewritten with the module-level
    ``transforms``), clusters the distinct titles and groups the contacts by
    cluster.

    Args:
        csv_file: path of the CSV file.

    Returns:
        dict mapping ``tuple(cluster)`` to a list of 'First Last' names of the
        contacts holding one of the cluster's titles. Singleton clusters are
        left out.
    """
    def split_titles(raw_title):
        # Split on every separator in turn. A fresh list is built on each
        # pass, so nothing is mutated during iteration and a title holding
        # several different separators is handled. A title without the
        # separator is kept untouched.
        pieces = [raw_title]
        for separator in separators:
            expanded = []
            for piece in pieces:
                if separator in piece:
                    expanded.extend(part.strip()
                                    for part in piece.split(separator)
                                    if part.strip() != '')
                else:
                    expanded.append(piece)
            pieces = expanded
        return pieces

    with codecs.open(csv_file, "rb", "utf-16") as handle:
        csvReader = csv.DictReader(handle, delimiter='\t', quotechar='"')
        # NOTE(review): DictReader already uses the first line as the header,
        # so this skips the first data row - kept as in the original.
        next(csvReader, None)
        contacts = [row for row in csvReader]

    all_titles = []
    for contact in contacts:
        if contact['Current Position'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_titles(contact['Current Position'])
        for transform in transforms:
            titles = [entry.replace(*transform) for entry in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Distance between two titles: DISTANCE over their word sets.
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    hc = HierarchicalClustering(all_titles, score)

    # Cut the hierarchy at the distance threshold and drop singletons.
    clusters = [c for c in hc.getlevel(DISTANCE_THRESHOLD) if len(c) > 1]

    # Collect, per cluster, the contacts holding one of its titles.
    clustered_contacts = {}
    for cluster in clusters:
        members = clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for job_title in contact['Job Titles']:
                if job_title in cluster:
                    members.append('%s %s' % (contact['First Name'],
                                              contact['Last Name']))

    return clustered_contacts
Пример #18
0
def breakToPeriods(arg, maximaOrder=20, clusteringGranularity = 0.5, file=False):
    """Cut a signal into periods delimited by its dominant local maxima.

    The local maxima of the signal are clustered by value; the largest
    cluster (or all maxima, when each ends up in its own cluster) marks the
    period boundaries. A stretch between two consecutive boundaries is kept
    only when its length lies strictly between 0.5 and 1.8 times the average
    period length.

    Args:
        arg: the samples (list or object with ``tolist()``), or - when
            ``file`` is true - the path of a text file with one float per line.
        maximaOrder: ``order`` argument handed to ``argrelextrema``.
        clusteringGranularity: fraction of the signal amplitude used as the
            clustering level.
        file: treat ``arg`` as a file path.

    Returns:
        A list of periods, each a slice of the input samples. Empty when the
        input is empty or fewer than two usable maxima exist.
    """
    if file:
        # The original bound a local named ``open`` further down, which made
        # this call fail with UnboundLocalError; it also never closed the file.
        with open(arg, 'r') as handle:
            inputAsList = [float(line) for line in handle]
    elif hasattr(arg, 'tolist'):
        inputAsList = arg.tolist()
    else:
        inputAsList = list(arg)

    a = np.array(inputAsList)
    if a.size == 0:
        return []

    localMax = argrelextrema(a, np.greater, 0, maximaOrder)[0].tolist()
    # Fewer than two maxima can never delimit a period.
    if len(localMax) < 2:
        return []

    amplitude = np.max(a) - np.min(a)
    cl = HierarchicalClustering(a.take(localMax).tolist(), lambda x, y: abs(x - y))
    clusters = cl.getlevel(int(amplitude * clusteringGranularity))
    if len(clusters) == 0:
        return []

    if len(clusters) == len(localMax):
        # Every maximum landed in its own cluster: use all of them. Flatten
        # the clusters so that the values (not one-element lists) are looked
        # up in the input below.
        longestSeq = []
        for cluster in clusters:
            if isinstance(cluster, list):
                longestSeq.extend(cluster)
            else:
                longestSeq.append(cluster)
    else:
        # First cluster of maximal size, as in the original scan.
        longestSeq = max(clusters, key=len)

    if len(longestSeq) < 2:
        return []

    averageLength = len(inputAsList)/len(longestSeq)
    # NOTE(review): index() finds the first occurrence of a value, so equal
    # maxima map onto the same position - behaviour kept from the original.
    indices = sorted(inputAsList.index(value) for value in longestSeq)

    periods = []
    start = indices[0]
    for end in indices[1:]:
        strideLen = end - start
        if 0.5 * averageLength < strideLen < 1.8 * averageLength:
            periods.append(inputAsList[start:end])
        start = end
    return periods
Пример #19
0
def getCorners(intersections):
    """Estimate four corners from a cloud of intersection points.

    Points are clustered at level 25 (distance: ``length`` of the segment
    between two points). The four biggest clusters are averaged with
    ``averageCoords`` and returned as
    (top-left, bottom-left, top-right, bottom-right).
    """
    clustering = HierarchicalClustering(
        intersections, lambda p1, p2: length([p1, p2]))

    # No check that these really are the corners: the four biggest clusters
    # are simply taken.
    biggest_first = sorted(clustering.getlevel(25), key=len, reverse=True)
    by_x = sorted(map(averageCoords, biggest_first[:4]), key=lambda p: p[0])

    # The two smallest x values form the left side; order each side by y.
    left = sorted(by_x[:2], key=lambda p: p[1])
    right = sorted(by_x[2:], key=lambda p: p[1])
    return left[0], left[1], right[0], right[1]
Пример #20
0
 def testCluster(self):
     """Cluster the string fixture and compare the level-0.5 cut
     (order-sensitive)."""
     clustering = HierarchicalClustering(self.__data, self.sim)
     expected = [
         ['ultricies'],
         ['Sed'],
         ['Phasellus'],
         ['mi'],
         ['Nullam'],
         ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
         ['leo', 'Lorem', 'dolor'],
         ['congue', 'neque', 'consectetuer', 'consequat'],
         ['adipiscing'],
         ['ipsum'],
     ]
     self.assertEqual(expected, clustering.getlevel(0.5))
Пример #21
0
def main():
  pC = PhamCluster()
  pC.initialize_matrix()
  #pC.calculate_distances()

  #print 'scoreMatrix:', pC.scoreMatrix
  #print 'distMatrix:', pC.distMatrix
  cl = HierarchicalClustering(pC.scoreMatrix, lambda x,y: pC.get_distance(x,y))
  #cutoff = raw_input('specify cutoff level:')
  cutoff = 1
  print 'using cutoff of 1'
  clusters = cl.getlevel(float(cutoff))
  print 'there are', len(clusters), 'clusters'
  print clusters
  print 'there are', len(clusters), 'clusters'
Пример #22
0
 def testCluster(self):
     """String clustering check; skipped up front, so the assertion below
     is not reached."""
     self.skipTest('These values lead to non-deterministic results. '
                   'This makes it untestable!')
     clustering = HierarchicalClustering(self.__data, self.sim)
     expected = [
         ['ultricies'],
         ['Sed'],
         ['Phasellus'],
         ['mi'],
         ['Nullam'],
         ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
         ['leo', 'Lorem', 'dolor'],
         ['congue', 'neque', 'consectetuer', 'consequat'],
         ['adipiscing'],
         ['ipsum'],
     ]
     self.assertEqual(expected, clustering.getlevel(0.5))
Пример #23
0
def get_music_bars(filename):
	"""Locate the vertical extents of staves on a scanned music page.

	For every pixel row the pixels equal to 0 are counted. Rows whose count
	is among the 400 largest counts are candidate staff-line rows; they are
	clustered by row distance (level 15). Clusters with more than two rows
	are reduced to [top, bottom] bands and bands taller than 20 rows are
	returned.

	NOTE(review): comparing pixels with 0 presumes a single-channel image
	where 0 means black - confirm against the inputs.

	Returns a list of [min_row, max_row] pairs.
	"""
	musicpage = Image.open(filename)
	pixels = musicpage.load()
	width, height = musicpage.size

	# Number of zero-valued pixels in each row.
	imgmat = [sum(1 for x in range(width) if pixels[x, y] == 0)
			  for y in range(height)]

	# A set gives a constant-time membership test per row.
	darkest_counts = set(sorted(imgmat, reverse=True)[:400])
	lineguesses = [row for row, count in enumerate(imgmat)
				   if count in darkest_counts]

	# A stave needs more than two rows; this also avoids the clustering's
	# special case for tiny inputs (a single item comes back as a flat list).
	if len(lineguesses) < 3:
		return []

	cl = HierarchicalClustering(lineguesses, lambda x, y: abs(x - y))
	staves = [rows for rows in cl.getlevel(15) if len(rows) > 2]
	bands = [[min(rows), max(rows)] for rows in staves]
	return [band for band in bands if band[1] - band[0] > 20]
Пример #24
0
    def testSingleLinkage(self):
        """Cluster the integer fixture with the default linkage and compare
        the level-40 cut, ignoring ordering."""
        cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))
        result = cl.getlevel(40)

        # Sort inside every cluster and between clusters so the test is
        # independent of the algorithm's output order. (The original called
        # the non-existent ``assertCItemsEqual``.)
        result = sorted([sorted(_) for _ in result])
        expected = [
            [24],
            [84, 124, 131, 134],
            [336, 365, 365, 391, 398],
            [518, 542, 564, 594],
            [676],
            [791],
            [835],
            [940, 956, 971],
        ]
        self.assertEqual(result, expected)
Пример #25
0
 def testUCLUS(self):
     """Cluster the integer fixture with 'uclus' linkage at level 40."""
     clustering = HierarchicalClustering(
         self.__data, lambda x, y: abs(x - y), linkage='uclus')
     expected = [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ]
     # Order-normalise before comparing.
     result = [sorted(members) for members in clustering.getlevel(40)]
     result.sort()
     self.assertEqual(result, expected)
Пример #26
0
 def testAverageLinkage(self):
     """Cluster the integer fixture with 'average' linkage at level 40."""
     clustering = HierarchicalClustering(
         self.__data, lambda x, y: abs(x - y), linkage='average')
     # TODO: The current test-data does not really trigger a difference
     # between UCLUS and "average" linkage.
     expected = [
         [24],
         [84],
         [124, 131, 134],
         [336, 365, 365, 391, 398],
         [518, 542, 564],
         [594],
         [676],
         [791],
         [835],
         [940, 956, 971],
     ]
     # Order-normalise before comparing.
     result = [sorted(members) for members in clustering.getlevel(40)]
     result.sort()
     self.assertEqual(result, expected)
Пример #27
0
def hac(topic):
    """
    Cluster the phrases of ``topic`` that have a truthy 'es_phrase' entry.

    Uses clusters.HierarchicalClustering
    (https://pypi.python.org/pypi/cluster/1.1.0b1) with ``score`` as the
    distance and ``DISTANCE_THRESHOLD`` as the cut level.
    """
    phrases = [entry for entry in topic if entry.get('es_phrase')]
    clusters = HierarchicalClustering(phrases, score).getlevel(
        DISTANCE_THRESHOLD)

    # The clustering API sometimes returns a list of dicts instead of a list
    # of lists, which breaks topic_extraction when it loops over the phrases.
    # Wrap that lone-dict result so callers always get a list of lists.
    lone_dict = len(clusters) == 1 and isinstance(clusters[0], dict)
    return [clusters] if lone_dict else clusters
Пример #28
0
    def set_new_level(self, level):
        """Re-cluster the data at ``level`` and rebuild the tree view with
        one text column per cluster."""
        clustering = HierarchicalClustering(self._data,
                                            self._relative_levenshtein)
        clusteredData = clustering.getlevel(level)

        self._parsed_clusteredData = self._parse(clusteredData)
        self._column_names = []
        for index in xrange(len(clusteredData)):
            self._column_names.append('Group %d' % index)

        # One string-typed store column per cluster.
        column_types = [str] * len(self._column_names)
        self.liststore = gtk.ListStore(*column_types)

        gtk.TreeView.__init__(self, self.liststore)

        # Show horizontal and vertical lines
        self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

        # Drop whatever columns are currently attached to the view.
        for stale_column in self.get_columns():
            self.remove_column(stale_column)

        # Internal variables
        self.current_path = None
        self.current_column = None

        # Maps each column object to its store column index.
        self._colDict = {}
        for index, title in enumerate(self._column_names):
            column = gtk.TreeViewColumn(title)
            self.append_column(column)
            renderer = gtk.CellRendererText()
            column.pack_start(renderer, True)
            column.set_attributes(renderer, text=index)
            self._colDict[column] = index

        for row in self._parsed_clusteredData:
            self.liststore.append(row)
Пример #29
0
    def set_new_level(self, level):
        """Cluster ``self._data`` at ``level`` and repopulate the tree view:
        one 'Group N' text column per cluster, one store row per parsed row."""
        clusteredData = HierarchicalClustering(
            self._data, self._relative_levenshtein).getlevel(level)

        self._parsed_clusteredData = self._parse(clusteredData)
        group_count = len(clusteredData)
        self._column_names = ['Group %d' % n for n in xrange(group_count)]

        # Every store column holds text.
        store_types = [str] * len(self._column_names)
        self.liststore = gtk.ListStore(*store_types)

        gtk.TreeView.__init__(self, self.liststore)

        # Show horizontal and vertical lines
        self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

        # Remove the columns currently attached to the view.
        for existing in self.get_columns():
            self.remove_column(existing)

        # Internal variables
        self.current_path = None
        self.current_column = None

        # Column object -> store column index.
        self._colDict = {}
        for position, heading in enumerate(self._column_names):
            view_column = gtk.TreeViewColumn(heading)
            self.append_column(view_column)
            text_cell = gtk.CellRendererText()
            view_column.pack_start(text_cell, True)
            view_column.set_attributes(text_cell, text=position)
            self._colDict[view_column] = position

        for parsed_row in self._parsed_clusteredData:
            self.liststore.append(parsed_row)
def run_clustering(repository_parameter_name, start_date, end_date, limit):
    """
    Load stations, cluster them by comparing their time series and log the
    resulting hierarchy.

    :param repository_parameter_name: One of the types from ``RepositoryParameter``
    :param start_date: First day
    :param end_date: Last day
    :param limit: Limit the number of examined stations
    :return: None; the clustering is written via ``logging``
    """
    repository = StationRepository(
        *get_repository_parameters(repository_parameter_name))
    station_dicts = repository.load_all_stations(
        start_date, end_date, limit=limit)
    comparator = StationTimeSeriesComparator(station_dicts)
    stations = []
    for station_dict in station_dicts:
        stations.append(Station(station_dict))

    cluster = HierarchicalClustering(
        stations, comparator.compare_time_series, num_processes=4)
    cluster.cluster()
    # The tree goes to the debug log, the raw clustered data to info.
    cluster.display(print_function=logging.debug)
    logging.info(cluster._data)
Пример #31
0
 def testMultiprocessing(self):
     """Clustering with four processes must keep every input value: the
     flattened level-40 clusters equal the input as a sorted list."""
     clustering = HierarchicalClustering(
         self.__data, lambda x, y: abs(x - y), num_processes=4)
     flattened = []
     for members in clustering.getlevel(40):
         flattened.extend(members)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Пример #32
0
#[pt.add_row([tag, freq]) for (tag, freq) in fdist.items() if freq > 1]
##[:50]
#pt.printt()

#sort_list = fdist.keys()
#print sort_list


print "Clustering Musics"      

# Define a scoring function: the distance between two items is DISTANCE
# applied to the sets built from each item.
def score(music1, music2):
  return DISTANCE(set(music1), set(music2))

# Feed the class your data and the scoring function
hc = HierarchicalClustering(musics, score)
# Cluster the data according to a distance threshold
clusters = hc.getlevel(DISTANCE_THRESHOLD)

# Remove singleton clusters
clusters = [c for c in clusters if len(c) > 1]


######## End: HAC ########

# Round up musics who are in these clusters and group them together

clustered_musics = {}

# Key every cluster by a tuple of its members (lists cannot be dict keys);
# each entry starts out as an empty list.
for cluster in clusters:
  clustered_musics[tuple(cluster)] = []
Пример #33
0
def test2():
    """Print every level-40 cluster of the module-level ``data``, then the
    data itself."""
    clustering = HierarchicalClustering(data, lambda x, y: abs(x - y))
    new_data = []
    clusters = clustering.getlevel(40)
    for members in clusters:
        print(members)
    print(data)
Пример #34
0
def run(level):
    """Print the clusters of the module-level ``data`` at ``level``, one
    cluster per line, after a header naming the level."""
    print('Level = {}'.format(level))
    clustering = HierarchicalClustering(data, lambda x, y: abs(x-y))
    for members in clustering.getlevel(level):
        print(members)
Пример #35
0
# Integer fixture, listed in ascending order.
data = [24, 84, 124, 131, 134, 336, 365, 365, 391, 398, 518, 542, 564, 594, 676,
        791, 835, 940, 956, 971]
# The same 20 values as ``data``, listed in a shuffled order.
data2 = [791, 956, 676, 124, 564, 84, 24, 365, 594, 940, 398, 971, 131, 365, 542,
         336, 518, 835, 134, 391]

def test2():
    """Print each level-40 cluster of ``data`` followed by ``data`` itself."""
    clustering = HierarchicalClustering(data, lambda x, y: abs(x - y))
    new_data = []
    clusters = clustering.getlevel(40)
    for members in clusters:
        print(members)
    print(data)
    #[new_data.extend(_) for _ in cl.getlevel(40)]
    #self.assertEqual(sorted(new_data), sorted(self.__data))

def run(level):
    """Cluster the module-level ``data`` and print the cut at ``level``."""
    header = 'Level = {}'.format(level)
    print(header)
    clustering = HierarchicalClustering(data, lambda x, y: abs(x-y))
    for members in clustering.getlevel(level):
        print(members)

# Show the input, then the clusters at level 40.
print(data)
run(40)
#print(len(data))

#test2()

# Cluster once more (the result is not used), then check that ``data`` and
# ``data2`` hold the same values.
cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
cl.getlevel(40)
print(sorted(data) == sorted(data2))
Пример #36
0
 def testUnmodifiedData(self):
     """Flattening the level-40 clusters must give back exactly the input
     values (compared as sorted lists)."""
     clustering = HierarchicalClustering(self.__data,
                                         lambda x, y: abs(x - y))
     flattened = []
     for members in clustering.getlevel(40):
         flattened.extend(members)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Пример #37
0
 def testClusterLen0(self):
     """
     An empty input list must cluster to an empty list.
     """
     clustering = HierarchicalClustering([], lambda x, y: abs(x - y))
     expected = []
     self.assertEqual(expected, clustering.getlevel(40))
Пример #38
0
 def testUnmodifiedData(self):
     """Flattening the level-0.5 clusters must give back exactly the input
     strings (compared as sorted lists)."""
     clustering = HierarchicalClustering(self.__data, self.sim)
     flattened = []
     for members in clustering.getlevel(0.5):
         flattened.extend(members)
     self.assertEqual(sorted(flattened), sorted(self.__data))
Пример #39
0
# print(n_sequence)
# Transpose n_sequence (rows become columns), join every column into one
# string and order the strings from longest to shortest.
n_sequence = zip(*n_sequence)
n_sequence = [''.join(i) for i in n_sequence]
n_sequence.sort(key=len, reverse=True)

# Walk the length-sorted strings; p_size tracks the length of the current run.
p_size = 0
new_seq = []
chunk = []
for seq in n_sequence:
    size = len(seq)
    if size == p_size:
        new_seq.append(seq)
    else:
        p_size = size
        # NOTE(review): nothing in the visible code ever appends to ``chunk``,
        # so this branch does not run as written. Inside it the getlevel()
        # result is discarded and the clustering object itself is added to
        # new_seq - presumably the getlevel() result was meant. The string
        # that starts a new length is also not stored anywhere. Confirm the
        # intent before relying on new_seq.
        if chunk:
            cl = HierarchicalClustering(chunk, lambda x, y: distance(x, y))
            cl.getlevel(1)
            new_seq += cl
            print(new_seq)
            chunk = []

print(len(n_sequence))
print(len(identifiers))
# Output layout: the joined identifiers, a newline, the sequences separated
# by newlines, and a closing newline - all encoded as UTF-8.
out.write(bytes(''.join(identifiers), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.write(bytes('\n'.join(n_sequence), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.close()

# File sizes of input and output in bytes.
in_size = os.path.getsize(in_file)
out_size = os.path.getsize(out_file)
Пример #40
0
def cluster_contacts_by_title(csv_file):
    def score(title1, title2):

        return DISTANCE((title1), (title2))

    all_titles = []
    all_titles.append("Student")
    all_titles.append("Assistant Professor")
    all_titles.append("Student Ambassador")
    all_titles.append("Assistant Developer")
    all_titles.append("Human Resources")
    all_titles.append("Software Developer")
    all_titles.append("Head, Technical Affairs-Software")
    all_titles.append("Sofware Engineer")

    all_titles.append("Software Engineer")
    all_titles.append("Design Secretary")
    all_titles.append("Telesales Executive")
    all_titles.append("Filmmaker")
    all_titles.append("Writer")
    all_titles.append("Data Developer")
    all_titles.append("Software Developmer")
    all_titles.append("Co-founder")

    all_titles.append("Assistant Manager")

    all_titles.append("Management Trainee - Operations")
    all_titles.append("Oracle Database Administrator")
    all_titles.append("Key Account Manager")
    all_titles.append("Engineering Manager")
    all_titles.append("Talent Acquisition Manager")
    all_titles.append("Wireless Protocol Test Intern")

    all_titles.append("HR Executive")
    all_titles.append("IT Company")
    all_titles.append("Business Development Manager")
    all_titles.append("Member of Technical Staff")
    all_titles.append("Web Designer")
    all_titles.append("ECE Student")
    all_titles.append("Intern")
    all_titles.append("Head of Growth")

    all_titles.append("SA")
    all_titles.append("Manager (Technology)")
    all_titles.append("Systems Engineer")
    all_titles.append("Technical Team Member")
    all_titles.append("Business Developer")
    all_titles.append("system engineer")
    all_titles.append("Infrastructure Developer")
    all_titles.append("Engineer")
    all_titles.append("Mechanical Engineer")
    all_titles.append("Student Technical Assistant")
    all_titles.append("Senior Software Engineer")
    all_titles.append("Senior Software Developer")
    all_titles.append("Associate Professor")
    all_titles.append("Professor")
    all_titles.append("Software developer")
    all_titles.append("Director - Software Engineering")
    all_titles.append("Product Manager")

    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)
    print clusters

    score_matrix = []
    min_d = 1000000
    for title1 in all_titles:
        temp = []
        for title2 in all_titles:
            li1 = title1.split(",")
            li2 = title2.split(",")
            for ll1 in li1:
                min_d = 100000
                for ll2 in li2:
                    #  print ll1,ll2
                    d = score(ll1, ll2)
                    # print d
                    min_d = min(min_d, d)
                #print "done"

                #   print d
            temp.append(min_d)
        score_matrix.append(temp)
#  print score_matrix
    print len(all_titles)

    i = j = k = l = 0
    mini = 10000
    for l1 in score_matrix:
        j = 0
        for l2 in l1:
            if l2 < mini and i != j:
                mini = l2
                k = i
                l = j
            j = j + 1
        i = i + 1


#    print "%d %d",(k,l)
#    print mini
    clusters = [c for c in clusters if len(c) > 1]
    # print clusters
    # Round up contacts who are in these clusters and group them together
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]

    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([
                        title.strip() for title in title.split(separator)
                        if title.strip() != ''
                    ])

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s ' %
                        (contact['First Name'], contact['Last Name']))

    return clustered_contacts
def cluster_contacts_by_title(csv_file):

    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
        ]

    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up list of common titles

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([title.strip() for title in title.split(separator)
                                  if title.strip() != ''])

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    print "Scoring...." , "\n"
    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    print "Clustering contacts by title...." , "\n"
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append('%s %s'
                            % (contact['First Name'], contact['Last Name']))

    return clustered_contacts