def testCompleteLinkage(self):
    """Complete-linkage clustering of the integer fixture, cut at level 40."""

    def absolute_difference(x, y):
        return abs(x - y)

    clustering = HierarchicalClustering(
        self.__data, absolute_difference, linkage='complete')

    # Normalise ordering (inside each cluster and between clusters) so the
    # assertion does not depend on the order the algorithm emits clusters in.
    normalised = [sorted(members) for members in clustering.getlevel(40)]
    normalised.sort()

    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365],
        [391, 398],
        [518],
        [542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    self.assertEqual(normalised, expected)
def testDataTypes(self):
    """Every cluster returned by ``getlevel`` must be exactly a ``list``."""
    clusters = HierarchicalClustering(self.__data, self.sim).getlevel(0.5)
    for cluster in clusters:
        self.assertEqual(type(cluster), list, "Every item should be a list!")
def buildHcluster(data, threshold):
    """
    Build a complete-linkage hierarchical clustering and record the result.

    Input:
        data: list of rows, e.g. [[12, 12], [34, 34], [23, 23]].
            (Original author's note: the first variable is a key and is not
            counted for clustering.)
        threshold: distance at which the cluster tree is cut.
    Output:
        None. The clusters are handed to ``writeCluster`` and a run log is
        written to ``<path>/log/H_<threshold>.log``.
    """
    print("Clustering...")
    started = datetime.datetime.now()
    clustering = HierarchicalClustering(data, distance_function, 'complete')
    clusterH = clustering.getlevel(threshold)
    finished = datetime.datetime.now()

    print("Naming...")
    featureAll = readFeature('all')
    named = nameCluster(clusterH, featureAll)
    writeCluster('H', clusterH, named[0], named[1], threshold)

    print("Writing...")
    header = "".join([
        "Hierahical Clustering Log\nDate:\t", str(started.date()),
        "\nStart:\t", str(started.time()),
        "\nEnd:\t", str(finished.time()),
        "\nDuration:\t", str(finished - started),
        "\nH:\t", str(threshold),
        "\nMethod:\tComplete",
        "\nNo. cluster:\t", str(len(clusterH)),
        "\n\n",
    ])
    log_name = path + '/log/H_' + str(threshold) + '.log'
    with open(log_name, 'w') as outfile:
        outfile.write(header)
        # One line per cluster: its length minus two.
        for members in clusterH:
            outfile.write(str(len(members) - 2) + "\n")
    return
def testClusterLen1(self):
    """
    Clustering a one-element input must give back that one element.
    """
    single = HierarchicalClustering([876], lambda x, y: abs(x - y))
    clusters = single.getlevel(40)
    self.assertEqual([876], clusters)
def getSubbatch(images, image_labels, similar_thred):
    """Group images of similar size into sub-batches of one common size.

    Images are clustered on the Manhattan distance between their
    (height, width). Every cluster with more than one member is resized to
    the cluster's median size and stacked; singleton clusters are passed
    through unchanged.

    Args:
        images: sequence of arrays exposing ``.shape`` (rows, cols, ...).
        image_labels: sequence parallel to ``images``.
        similar_thred: clustering cut level (max size distance in a cluster).

    Returns:
        List of ``{'images': ndarray, 'labels': ndarray}`` dicts, one per
        cluster, in the order the clustering returns them.
    """
    # Degenerate inputs are answered directly. For a single item the
    # clustering library may hand back the raw data instead of a list of
    # clusters, which the per-cluster logic below cannot process.
    if len(images) == 0:
        return []
    if len(images) == 1:
        return [{
            'images': np.array([images[0]]),
            'labels': np.array([image_labels[0]]),
        }]

    sizes = [(image.shape[0], image.shape[1], idx)
             for idx, image in enumerate(images)]
    cl = HierarchicalClustering(
        sizes, lambda x, y: abs(x[0] - y[0]) + abs(x[1] - y[1]))
    clusters = cl.getlevel(similar_thred)

    # NOTE(review): the original called sorted(clusters, key=len) and threw
    # the result away, so clusters were never actually reordered. The dead
    # call is removed; the output order is unchanged.
    subbatches = []
    for cluster in clusters:
        if len(cluster) > 1:
            ideal_size = [int(i) for i in np.median(cluster, axis=0)]
            # cv2.resize expects (width, height); shape is (rows, cols).
            target = (ideal_size[1], ideal_size[0])
            subbatch_im = []
            subbatch_label = []
            for img in cluster:
                if img[0] != ideal_size[0] or img[1] != ideal_size[1]:
                    subbatch_im.append(cv2.resize(images[img[2]], target))
                    # NOTE(review): default interpolation is used for labels
                    # too; confirm this is acceptable for label maps.
                    subbatch_label.append(
                        cv2.resize(image_labels[img[2]], target))
                else:
                    subbatch_im.append(images[img[2]])
                    subbatch_label.append(image_labels[img[2]])
            subbatches.append({
                'images': np.array(subbatch_im),
                'labels': np.array(subbatch_label),
            })
        else:
            only = cluster[0][2]
            subbatches.append({
                'images': np.array([images[only]]),
                'labels': np.array([image_labels[only]]),
            })
    return subbatches
def clustertitle(request):
    """Render live articles from the last day, grouped by title similarity."""
    from cluster import HierarchicalClustering

    def title_distance(first, second):
        # Distance is one minus the n-gram similarity of the two titles.
        similarity = NGram.compare(
            first.title, second.title, warp=WARP, iconv=enrich)
        return 1 - similarity

    def as_node(group):
        # Multi-article groups become a cluster node; a lone article stays
        # a plain article node.
        if len(group) > 1:
            return {
                'type': 'cluster',
                'topic': common_terms([a.title for a in group]),
                'articles': group,
            }
        return {
            'type': 'article',
            'article': group[0],
        }

    one_day_ago = datetime.datetime.now() - datetime.timedelta(1)
    articles = Article.objects.filter(
        status="live",
        date_published__gte=one_day_ago,
    ).order_by("date_published")[:1000]

    # 0.7 chosen pretty much through trial and error :)
    groups = HierarchicalClustering(articles, title_distance).getlevel(0.7)

    clusters = [as_node(group) for group in groups]
    return render(request, "clusters.html", dictionary={
        "clusters": clusters,
    })
def testCluster(self):
    """Integer fixture: explicit ``cluster()`` call, then cut at level 40."""

    def absolute_difference(x, y):
        return abs(x - y)

    clustering = HierarchicalClustering(self.__data, absolute_difference)
    clustering.cluster()

    expected = [
        [24],
        [84, 124, 131, 134],
        [336, 365, 365, 365, 398, 391],
        [940, 956, 971],
        [791],
        [835],
        [676],
        [518, 564, 542],
    ]
    self.assertEqual(expected, clustering.getlevel(40))
def test(data, expected):
    """Cluster *data* at level 5, print the inputs/outputs and compare."""
    clustering = HierarchicalClustering(data, lambda x, y: abs(x - y))
    result = clustering.getlevel(5)
    # Show the sorted input, the actual result and the expectation in turn.
    for shown in (sorted(data), result, expected):
        print(shown)
    assert result == expected
    print('ok')
def testCluster(self):
    """String fixture clustered with ``self.sim`` and cut at level 0.5."""
    expected = [
        ['Nullam.'],
        ['Sed'],
        ['mi.'],
        ['ultricies'],
        ['Phasellus'],
        ['amet,', 'at'],
        ['sit', 'elit.', 'elit.', 'elit.'],
        ['leo', 'Lorem', 'dolor'],
        ['neque.', 'congue', 'consectetuer', 'consequat'],
        ['ipsum'],
        ['adipiscing'],
    ]
    clustering = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual(expected, clustering.getlevel(0.5))
def cluster(unigrams, distance_threshold=0.2, min_members=20):
    """Cluster *unigrams* hierarchically and keep only the large clusters.

    Args:
        unigrams: items to cluster; compared with the module-level ``score``
            distance function.
        distance_threshold: level at which the cluster tree is cut
            (default 0.2, the value previously hard-coded).
        min_members: clusters with this many members or fewer are dropped
            (default 20, the value previously hard-coded).

    Returns:
        List of clusters that each have more than ``min_members`` members.
    """
    # Feed the class the data and the scoring function.
    hc = HierarchicalClustering(unigrams, score)

    # Cut the tree at the requested distance threshold.
    clusters = hc.getlevel(distance_threshold)

    # Drop small clusters. The original comment said "singleton clusters",
    # but the filter has always been "more than 20 members".
    return [c for c in clusters if len(c) > min_members]
def testSingleLinkage(self):
    """Clustering 2-D tuples with a Euclidean distance returns a result."""

    def euclidian_distance(a, b):
        squared = [(p - q) ** 2 for p, q in zip(a, b)]
        return sqrt(sum(squared))

    self.__data = [(1, 1), (1, 2), (1, 3)]
    clustering = HierarchicalClustering(self.__data, euclidian_distance)
    self.assertIsNotNone(clustering.getlevel(40))
def testIssue28(self):
    """Issue28: clustering dict keys by the distance of their values."""
    points1D = {
        'p4': 5,
        'p2': 6,
        'p7': 10,
        'p9': 120,
        'p10': 121,
        'p11': 119,
    }

    def distance_func(a, b):
        # Items are keys; the distance is taken between the mapped values.
        return abs(points1D[a] - points1D[b])

    names = list(points1D.keys())
    clustering = HierarchicalClustering(names, distance_func)
    self.assertIsNotNone(clustering.getlevel(20))
def ml():
    """Cluster queues by size, correlate the columns and fit a regressor.

    Steps:
      1. Load (number_of_people, queue_name) per queue from the database.
      2. Cluster the head counts hierarchically (cut level 5) and label each
         data-frame row with its cluster name.
      3. Add a random integer column and compute the Pearson correlation.
      4. Fit a random forest on Number_of_people -> randNumCol and report
         the mean absolute error on the held-out split.

    Returns:
        The rendered ``ML.html`` template.

    Side effects:
        Prints intermediate tables and rebinds the module-level
        ``cluster_number`` to the last cluster label.
    """
    global cluster_number

    rows = db.session.query(
        ormQueue.number_of_people, ormQueue.queue_name
    ).group_by(ormQueue.queue_name).all()
    queues = [row.queue_name for row in rows]
    n_people = [int(row.number_of_people) for row in rows]

    cl = HierarchicalClustering(n_people, lambda x, y: abs(x - y))
    res = cl.getlevel(5)

    df = pd.DataFrame(
        {'Queues_name': queues, 'Number_of_people': n_people},
        columns=['Queues_name', 'Number_of_people'])
    print(df)

    for number, members in enumerate(res):
        cluster_number = "Cluster" + str(number + 1)
        print(cluster_number)
        # For a single data point the clustering may return the bare value
        # rather than a one-element list; treat both the same way.
        if not isinstance(members, (list, tuple)):
            members = [members]
        for elem in members:
            print(elem)
            df.loc[df['Number_of_people'] == elem, 'Cluster'] = cluster_number
    print(df)

    df['randNumCol'] = np.random.randint(1, 6, df.shape[0])
    print(df)
    pearsoncorr = df.corr(method='pearson')
    print(pearsoncorr)

    # scikit-learn expects X as (n_samples, n_features); keep it 2-D.
    X = df[['Number_of_people']]
    Y = df['randNumCol']
    seed = 7
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    # Fix: the original fitted on [X_train] (one sample with n features),
    # predicted on y_train and compared against X_test. Train on the
    # training split, predict the test split, compare with the test targets.
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    errors = abs(np.array(predictions) - np.array(y_test))
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

    return render_template(
        'ML.html',
        name="Clasterization",
        name2="Correlation",
        name3="Regression Model",
        tables=[df.to_html()],
        error=errors,
        table=[pearsoncorr.to_html()],
        action="/ML")
def testCluster(self):
    """Integer fixture clustered explicitly, then cut at level 40."""
    clustering = HierarchicalClustering(
        self.__data,
        lambda x, y: abs(x - y),
    )
    clustering.cluster()

    expected = [
        [24],
        [84, 124, 131, 134],
        [336, 365, 365, 365, 398, 391],
        [940, 956, 971],
        [791],
        [835],
        [676],
        [518, 564, 542],
    ]
    actual = clustering.getlevel(40)
    self.assertEqual(expected, actual)
def urls_clustering(urls):
    """Cluster *urls* by textual similarity and return the clusters."""

    def distance(url1, url2):
        # Distance between two URLs: one minus difflib's SequenceMatcher
        # similarity ratio.
        matcher = SequenceMatcher(None, url1, url2)
        return 1.0 - matcher.ratio()

    # Run the hierarchical clustering and cut the tree at 0.2.
    clusters = HierarchicalClustering(urls, distance).getlevel(0.2)
    return clusters
def hierarchical_clustering_by_title(csv_file):
    """Cluster contacts by job title and group contact names per cluster.

    Reads a tab-separated, UTF-16 encoded CSV export, splits combined titles
    on the module-level ``separators``, applies the module-level
    ``transforms`` replacements, clusters the distinct titles and returns,
    for each non-singleton cluster, the names of the contacts holding one of
    its titles.

    Args:
        csv_file: path of the CSV export. Rows need the columns
            'Current Position', 'First Name' and 'Last Name'.

    Returns:
        Dict mapping ``tuple(cluster)`` to a list of 'First Last' strings.
    """

    def split_title(raw):
        # Split on every separator in turn. The original removed items from
        # the list it was iterating, which raised ValueError or skipped
        # parts when a title contained more than one separator.
        parts = [raw]
        for separator in separators:
            pieces = []
            for part in parts:
                if part.find(separator) >= 0:
                    pieces.extend(
                        p.strip() for p in part.split(separator)
                        if p.strip() != '')
                else:
                    pieces.append(part)
            parts = pieces
        return parts

    # Context manager so the file is closed once the rows are loaded.
    with codecs.open(csv_file, "rb", "utf-16") as handle:
        csvReader = csv.DictReader(handle, delimiter='\t', quotechar='"')
        # NOTE(review): DictReader already consumes the header row, so this
        # discards the first data row (as the original did) - confirm intent.
        next(csvReader)
        contacts = [row for row in csvReader]

    all_titles = []
    for contact in contacts:
        if contact['Current Position'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_title(contact['Current Position'])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Scoring function: distance between the word sets of two titles.
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class the data and the scoring function.
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold.
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters.
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together.
    clustered_contacts = {}
    for cluster in clusters:
        key = tuple(cluster)
        clustered_contacts[key] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[key].append(
                        '%s %s' % (contact['First Name'],
                                   contact['Last Name']))
    return clustered_contacts
def breakToPeriods(arg, maximaOrder=20, clusteringGranularity=0.5, file=False):
    """Cut a signal into periods delimited by its dominant local maxima.

    Local maxima are located with ``argrelextrema`` and clustered by value;
    the largest cluster of maxima supplies the cut points. A slice between
    two consecutive cut points is kept when its length lies strictly between
    0.5 and 1.8 times the average period length.

    Args:
        arg: the samples (list or array with ``tolist()``), or a path to a
            text file with one float per line when ``file`` is true.
        maximaOrder: ``order`` passed to ``argrelextrema``.
        clusteringGranularity: fraction of the signal amplitude used as the
            clustering cut level.
        file: treat ``arg`` as a file path.

    Returns:
        List of periods (each a list slice of the input); empty when the
        input is empty or fewer than two usable maxima are found.
    """
    if file:
        # Context manager so the handle is closed (the original leaked it).
        with open(arg, 'r') as handle:
            inputAsList = [float(line) for line in handle]
    elif isinstance(arg, list):
        inputAsList = arg
    else:
        inputAsList = arg.tolist()

    # The original relied on a bare ``except`` around np.max/np.min to catch
    # empty input; check explicitly instead of swallowing every exception.
    if len(inputAsList) == 0:
        return []

    a = np.array(inputAsList)
    localMax = argrelextrema(a, np.greater, 0, maximaOrder)[0].tolist()
    amplitude = np.max(a) - np.min(a)

    peakValues = a.take(localMax).tolist()
    cl = HierarchicalClustering(peakValues, lambda x, y: abs(x - y))
    clusters = cl.getlevel(int(amplitude * clusteringGranularity))
    if len(clusters) == 0:
        return []

    if len(clusters) == len(localMax):
        # Every maximum ended up in its own cluster: use all maxima. The
        # original assigned the list of clusters here, which later failed in
        # list.index() because its elements were lists, not sample values.
        longestSeq = peakValues
    else:
        # First cluster of maximal size (same tie-breaking as before).
        longestSeq = max(clusters, key=len)

    if len(longestSeq) < 2:
        return []

    averageLength = len(inputAsList) / len(longestSeq)

    # Map the chosen peak values back to peak positions. Looking values up
    # with list.index() returned the first occurrence of a value, which may
    # be a non-peak sample or the wrong one of two equal peaks.
    chosen = set(longestSeq)
    indices = sorted(i for i in localMax if inputAsList[i] in chosen)
    if not indices:
        return []

    periods = []
    start = indices[0]
    for end in indices[1:]:
        strideLen = end - start
        if 0.5 * averageLength < strideLen < 1.8 * averageLength:
            periods.append(inputAsList[start:end])
        start = end
    return periods
def getCorners(intersections):
    """Return four corner points derived from clustered intersections.

    The points are clustered at level 25, the four largest clusters are
    averaged, and the results are returned ordered as: the two with the
    smaller first coordinate (sorted by second coordinate), then the two
    with the larger first coordinate (sorted by second coordinate).
    """
    def point_distance(p1, p2):
        return length([p1, p2])

    clusters = HierarchicalClustering(intersections, point_distance).getlevel(25)

    # probably want to make sure we actually have the corners at this point.
    # For now, the 4 biggest clusters are taken.
    biggest = sorted(clusters, key=len, reverse=True)[:4]
    corners = [averageCoords(group) for group in biggest]
    corners.sort(key=lambda p: p[0])

    left = corners[:2]
    right = corners[2:]
    left.sort(key=lambda p: p[1])
    right.sort(key=lambda p: p[1])

    # Order: top-left, bottom-left, top-right, bottom-right.
    return left[0], left[1], right[0], right[1]
def testCluster(self):
    """String fixture clustered with ``self.sim`` and cut at level 0.5."""
    expected = [
        ['ultricies'],
        ['Sed'],
        ['Phasellus'],
        ['mi'],
        ['Nullam'],
        ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
        ['leo', 'Lorem', 'dolor'],
        ['congue', 'neque', 'consectetuer', 'consequat'],
        ['adipiscing'],
        ['ipsum'],
    ]
    clustering = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual(expected, clustering.getlevel(0.5))
def main():
    """Cluster the PhamCluster score matrix at cutoff 1 and print the result."""
    pC = PhamCluster()
    pC.initialize_matrix()

    def pair_distance(x, y):
        return pC.get_distance(x, y)

    cl = HierarchicalClustering(pC.scoreMatrix, pair_distance)

    cutoff = 1
    print('using cutoff of 1')
    clusters = cl.getlevel(float(cutoff))

    # The cluster count is reported before and after the dump.
    print(' '.join(['there are', str(len(clusters)), 'clusters']))
    print(clusters)
    print(' '.join(['there are', str(len(clusters)), 'clusters']))
def testCluster(self):
    """String clustering check; currently skipped before any clustering runs."""
    self.skipTest('These values lead to non-deterministic results. '
                  'This makes it untestable!')
    expected = [
        ['ultricies'],
        ['Sed'],
        ['Phasellus'],
        ['mi'],
        ['Nullam'],
        ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
        ['leo', 'Lorem', 'dolor'],
        ['congue', 'neque', 'consectetuer', 'consequat'],
        ['adipiscing'],
        ['ipsum'],
    ]
    clustering = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual(expected, clustering.getlevel(0.5))
def get_music_bars(filename):
    """Return [first_row, last_row] bands of candidate staves in an image.

    Rows are ranked by how many pixels equal 0; rows whose count is among
    the 400 largest counts are clustered by row index (cut level 15).
    Clusters with more than two rows become bands, and bands taller than
    20 rows are returned.
    """
    musicpage = Image.open(filename)
    pixels = musicpage.load()
    width, height = musicpage.size

    # Number of zero-valued pixels in each row.
    row_counts = []
    for y in range(height):
        row_counts.append(sum(1 for x in range(width) if pixels[x, y] == 0))

    largest_counts = sorted(row_counts, reverse=True)[0:400]
    lineguesses = [row for row, count in enumerate(row_counts)
                   if count in largest_counts]

    clustering = HierarchicalClustering(lineguesses, lambda x, y: abs(x - y))

    bars = []
    for group in clustering.getlevel(15):
        if len(group) <= 2:
            continue
        low, high = min(group), max(group)
        if high - low > 20:
            bars.append([low, high])
    return bars
def testSingleLinkage(self):
    """Default-linkage clustering of the integer fixture, cut at level 40."""
    clustering = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))

    # Sort inside each cluster so the check is less sensitive to changes in
    # the algorithm's internal ordering.
    normalised = []
    for members in clustering.getlevel(40):
        normalised.append(sorted(members))

    expected = [
        [24],
        [336, 365, 365, 391, 398],
        [518, 542, 564, 594],
        [676],
        [791],
        [835],
        [84, 124, 131, 134],
        [940, 956, 971],
    ]
    self.assertCItemsEqual(expected, normalised)
def testUCLUS(self):
    """UCLUS-linkage clustering of the integer fixture, cut at level 40."""

    def absolute_difference(x, y):
        return abs(x - y)

    clustering = HierarchicalClustering(
        self.__data, absolute_difference, linkage='uclus')
    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365, 391, 398],
        [518, 542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    # Order-normalise both inside and between clusters before comparing.
    normalised = [sorted(members) for members in clustering.getlevel(40)]
    normalised.sort()
    self.assertEqual(normalised, expected)
def testAverageLinkage(self):
    """Average-linkage clustering of the integer fixture, cut at level 40."""

    def absolute_difference(x, y):
        return abs(x - y)

    clustering = HierarchicalClustering(
        self.__data, absolute_difference, linkage='average')

    # TODO: The current test-data does not really trigger a difference
    #       between UCLUS and "average" linkage.
    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365, 391, 398],
        [518, 542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    normalised = [sorted(members) for members in clustering.getlevel(40)]
    normalised.sort()
    self.assertEqual(normalised, expected)
def hac(topic):
    """
    Cluster the phrases of *topic* that carry a truthy 'es_phrase' value.

    Uses clusters.HierarchicalClustering
    https://pypi.python.org/pypi/cluster/1.1.0b1
    """
    phrases = [entry for entry in topic if entry.get('es_phrase')]

    # Score with the module-level ``score`` function and cut the tree at the
    # module-level DISTANCE_THRESHOLD.
    clusters = HierarchicalClustering(phrases, score).getlevel(
        DISTANCE_THRESHOLD)

    # Sometimes the clustering API returns a list of dicts instead of a list
    # of lists, which breaks callers that loop over the phrases of each
    # cluster; wrap that case so the result is always a list of lists.
    lone_dict = len(clusters) == 1 and isinstance(clusters[0], dict)
    return [clusters] if lone_dict else clusters
def set_new_level(self, level):
    """Re-cluster ``self._data`` at *level* and rebuild the tree view.

    One column ('Group <n>') is created per cluster and the parsed cluster
    rows are loaded into a fresh list store.
    """
    # Create the clusters
    clustering = HierarchicalClustering(
        self._data, self._relative_levenshtein)
    clusteredData = clustering.getlevel(level)
    self._parsed_clusteredData = self._parse(clusteredData)
    self._column_names = [
        'Group %d' % index for index, _ in enumerate(clusteredData)
    ]

    # Tree view and list store creation: one string column per group.
    column_types = [str] * len(self._column_names)
    self.liststore = gtk.ListStore(*column_types)
    gtk.TreeView.__init__(self, self.liststore)

    # Show horizontal and vertical lines
    self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

    # Clear any columns left over from a previous level
    for stale_column in self.get_columns():
        self.remove_column(stale_column)

    # Internal variables
    self.current_path = None
    self.current_column = None
    self._colDict = {}

    for position, cname in enumerate(self._column_names):
        column = gtk.TreeViewColumn(cname)
        self.append_column(column)
        renderer = gtk.CellRendererText()
        column.pack_start(renderer, True)
        column.set_attributes(renderer, text=position)
        # Remember which index each column object has.
        self._colDict[column] = position

    for row in self._parsed_clusteredData:
        self.liststore.append(row)
def set_new_level(self, level):
    """Cluster ``self._data`` at *level* and repopulate the widget.

    Builds one text column per cluster, named 'Group <n>', and fills a new
    list store with the rows produced by ``self._parse``.
    """
    # Create the clusters
    clusteredData = HierarchicalClustering(
        self._data, self._relative_levenshtein).getlevel(level)
    self._parsed_clusteredData = self._parse(clusteredData)

    group_names = []
    for number, _ in enumerate(clusteredData):
        group_names.append('Group %d' % number)
    self._column_names = group_names

    # Start with the treeview and liststore creation
    store_types = [str] * len(self._column_names)
    self.liststore = gtk.ListStore(*store_types)
    gtk.TreeView.__init__(self, self.liststore)

    # Show horizontal and vertical lines
    self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

    # First clear the treeview
    for old_column in self.get_columns():
        self.remove_column(old_column)

    # Internal variables
    self.current_path = None
    self.current_column = None
    self._colDict = {}

    for col_index, cname in enumerate(self._column_names):
        colObject = gtk.TreeViewColumn(cname)
        self.append_column(colObject)
        textRenderer = gtk.CellRendererText()
        colObject.pack_start(textRenderer, True)
        colObject.set_attributes(textRenderer, text=col_index)
        # Keep the column-object -> index mapping for later lookups.
        self._colDict[colObject] = col_index

    for parsed_row in self._parsed_clusteredData:
        self.liststore.append(parsed_row)
def run_clustering(repository_parameter_name, start_date, end_date, limit):
    """
    Load stations, cluster them by time-series similarity and log the result.

    :param repository_parameter_name: One of the types from
        ``RepositoryParameter``
    :param start_date: First day
    :param end_date: Last day
    :param limit: Limit the number of examined stations
    :return: None; the clustering is displayed through ``logging``
    """
    repository_args = get_repository_parameters(repository_parameter_name)
    repository = StationRepository(*repository_args)
    station_dicts = repository.load_all_stations(
        start_date, end_date, limit=limit)

    comparator = StationTimeSeriesComparator(station_dicts)
    stations = []
    for station_dict in station_dicts:
        stations.append(Station(station_dict))

    cluster = HierarchicalClustering(
        stations, comparator.compare_time_series, num_processes=4)
    cluster.cluster()
    cluster.display(print_function=logging.debug)
    logging.info(cluster._data)
def testMultiprocessing(self):
    """With four worker processes, clustering must keep every input value."""
    clustering = HierarchicalClustering(
        self.__data, lambda x, y: abs(x - y), num_processes=4)

    # Flatten the clusters back into one list of values.
    flattened = []
    for members in clustering.getlevel(40):
        flattened.extend(members)

    self.assertEqual(sorted(flattened), sorted(self.__data))
#[pt.add_row([tag, freq]) for (tag, freq) in fdist.items() if freq > 1] ##[:50] #pt.printt() #sort_list = fdist.keys() #print sort_list print "Clustering Musics" # Define a scoring function def score(music1, music2): return DISTANCE(set(music1), set(music2)) # Feed the class your data and the scoring function hc = HierarchicalClustering(musics, score) # Cluster the data according to a distance threshold clusters = hc.getlevel(DISTANCE_THRESHOLD) # Remove singleton clusters clusters = [c for c in clusters if len(c) > 1] ######## End: HAC ######## # Round up musics who are in these clusters and group them together clustered_musics = {} for cluster in clusters: clustered_musics[tuple(cluster)] = []
def test2():
    """Print each cluster of ``data`` at level 40, then ``data`` itself."""

    def absolute_difference(x, y):
        return abs(x - y)

    clustering = HierarchicalClustering(data, absolute_difference)
    for members in clustering.getlevel(40):
        print(members)
    print(data)
def run(level):
    """Cluster the module-level ``data`` at *level* and print each cluster."""
    print('Level = {}'.format(level))

    def absolute_difference(x, y):
        return abs(x - y)

    clusters = HierarchicalClustering(data, absolute_difference).getlevel(level)
    for members in clusters:
        print(members)
# Sample values in ascending order, and the same values shuffled.
data = [
    24, 84, 124, 131, 134, 336, 365, 365, 391, 398,
    518, 542, 564, 594, 676, 791, 835, 940, 956, 971,
]
data2 = [
    791, 956, 676, 124, 564, 84, 24, 365, 594, 940,
    398, 971, 131, 365, 542, 336, 518, 835, 134, 391,
]


def test2():
    """Print each cluster of ``data`` at level 40, then ``data`` itself."""
    clustering = HierarchicalClustering(data, lambda x, y: abs(x - y))
    for members in clustering.getlevel(40):
        print(members)
    print(data)


def run(level):
    """Cluster ``data`` at *level* and print every cluster."""
    print('Level = {}'.format(level))
    clusters = HierarchicalClustering(
        data, lambda x, y: abs(x - y)).getlevel(level)
    for members in clusters:
        print(members)


print(data)
run(40)

# Cluster once more at module level, then check that both sample lists hold
# the same values.
cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
cl.getlevel(40)
print(sorted(data) == sorted(data2))
def testUnmodifiedData(self):
    """Flattening the clusters must give back exactly the input values."""
    clustering = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))

    flattened = []
    for members in clustering.getlevel(40):
        flattened.extend(members)

    self.assertEqual(sorted(flattened), sorted(self.__data))
def testClusterLen0(self):
    """
    Clustering an empty list must give back an empty list.
    """
    empty = HierarchicalClustering([], lambda x, y: abs(x - y))
    clusters = empty.getlevel(40)
    self.assertEqual([], clusters)
def testUnmodifiedData(self):
    """Flattened string clusters must contain exactly the input items."""
    clustering = HierarchicalClustering(self.__data, self.sim)

    flattened = []
    for members in clustering.getlevel(0.5):
        flattened.extend(members)

    self.assertEqual(sorted(flattened), sorted(self.__data))
# print(n_sequence) n_sequence = zip(*n_sequence) n_sequence = [''.join(i) for i in n_sequence] n_sequence.sort(key=len, reverse=True) p_size = 0 new_seq = [] chunk = [] for seq in n_sequence: size = len(seq) if size == p_size: new_seq.append(seq) else: p_size = size if chunk: cl = HierarchicalClustering(chunk, lambda x, y: distance(x, y)) cl.getlevel(1) new_seq += cl print(new_seq) chunk = [] print(len(n_sequence)) print(len(identifiers)) out.write(bytes(''.join(identifiers), 'UTF-8')) out.write(bytes('\n', 'UTF-8')) out.write(bytes('\n'.join(n_sequence), 'UTF-8')) out.write(bytes('\n', 'UTF-8')) out.close() in_size = os.path.getsize(in_file) out_size = os.path.getsize(out_file)
def cluster_contacts_by_title(csv_file):
    """Cluster a fixed list of job titles and group CSV contacts by cluster.

    The clusters are built from the hard-coded title list below (not from
    the CSV). The CSV is then read, each contact's 'Job Title' is split on
    separators and normalised with the abbreviation transforms, and every
    contact whose title appears in a non-singleton cluster is listed under
    that cluster.

    Args:
        csv_file: path of a comma-separated file with the columns
            'Job Title', 'First Name' and 'Last Name'.

    Returns:
        Dict mapping ``tuple(cluster)`` to a list of 'First Last ' strings
        (the trailing space is part of the original format).

    Side effects:
        Prints the raw clusters and the number of hard-coded titles.
    """

    def score(title1, title2):
        return DISTANCE((title1), (title2))

    all_titles = [
        "Student",
        "Assistant Professor",
        "Student Ambassador",
        "Assistant Developer",
        "Human Resources",
        "Software Developer",
        "Head, Technical Affairs-Software",
        "Sofware Engineer",
        "Software Engineer",
        "Design Secretary",
        "Telesales Executive",
        "Filmmaker",
        "Writer",
        "Data Developer",
        "Software Developmer",
        "Co-founder",
        "Assistant Manager",
        "Management Trainee - Operations",
        "Oracle Database Administrator",
        "Key Account Manager",
        "Engineering Manager",
        "Talent Acquisition Manager",
        "Wireless Protocol Test Intern",
        "HR Executive",
        "IT Company",
        "Business Development Manager",
        "Member of Technical Staff",
        "Web Designer",
        "ECE Student",
        "Intern",
        "Head of Growth",
        "SA",
        "Manager (Technology)",
        "Systems Engineer",
        "Technical Team Member",
        "Business Developer",
        "system engineer",
        "Infrastructure Developer",
        "Engineer",
        "Mechanical Engineer",
        "Student Technical Assistant",
        "Senior Software Engineer",
        "Senior Software Developer",
        "Associate Professor",
        "Professor",
        "Software developer",
        "Director - Software Engineering",
        "Product Manager",
    ]

    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold.
    clusters = hc.getlevel(DISTANCE_THRESHOLD)
    print(clusters)

    # NOTE(review): the original built a full pairwise score matrix here and
    # searched it for the closest pair, but never used or returned either
    # result. That dead O(n^2) scan has been removed; only its one visible
    # effect, printing the title count, is kept.
    print(len(all_titles))

    # Remove singleton clusters.
    clusters = [c for c in clusters if len(c) > 1]

    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    separators = ['/', 'and', '&']

    def split_title(raw):
        # Split on every separator in turn. The original removed items from
        # the list it was iterating, which raised ValueError or skipped
        # parts when a title contained more than one separator.
        parts = [raw]
        for separator in separators:
            pieces = []
            for part in parts:
                if part.find(separator) >= 0:
                    pieces.extend(
                        p.strip() for p in part.split(separator)
                        if p.strip() != '')
                else:
                    pieces.append(part)
            parts = pieces
        return parts

    # Context manager so the file is closed once the rows are loaded.
    with open(csv_file) as handle:
        csvReader = csv.DictReader(handle, delimiter=',', quotechar='"')
        contacts = [row for row in csvReader]

    for contact in contacts:
        if contact['Job Title'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_title(contact['Job Title'])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contact['Job Titles'] = titles

    # Round up contacts who are in these clusters and group them together.
    clustered_contacts = {}
    for cluster in clusters:
        key = tuple(cluster)
        clustered_contacts[key] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[key].append(
                        '%s %s ' % (contact['First Name'],
                                    contact['Last Name']))
    return clustered_contacts
def cluster_contacts_by_title(csv_file):
    """Cluster contacts by normalised job title.

    Reads a comma-separated contact export, splits combined titles on the
    separators, expands known abbreviations, clusters the distinct titles
    and returns, for each non-singleton cluster, the names of the contacts
    holding one of its titles.

    Args:
        csv_file: path of a CSV file with the columns 'Job Title',
            'First Name' and 'Last Name'.

    Returns:
        Dict mapping ``tuple(cluster)`` to a list of 'First Last' strings.

    Side effects:
        Prints two progress messages.
    """
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    separators = ['/', 'and', '&']

    def split_title(raw):
        # Split on every separator in turn. The original removed items from
        # the list it was iterating, which raised ValueError or skipped
        # parts when a title contained more than one separator.
        parts = [raw]
        for separator in separators:
            pieces = []
            for part in parts:
                if part.find(separator) >= 0:
                    pieces.extend(
                        p.strip() for p in part.split(separator)
                        if p.strip() != '')
                else:
                    pieces.append(part)
            parts = pieces
        return parts

    # Context manager so the file is closed once the rows are loaded.
    with open(csv_file) as handle:
        csvReader = csv.DictReader(handle, delimiter=',', quotechar='"')
        contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up the list of common titles.
    all_titles = []
    for contact in contacts:
        if contact['Job Title'] == '':
            contact['Job Titles'] = ['']
            continue
        titles = split_title(contact['Job Title'])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contact['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Same output as the Python 2 statement `print "Scoring....", "\n"`.
    print("Scoring.... \n")

    # Scoring function: distance between the word sets of two titles.
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class the data and the scoring function.
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold.
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters.
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together.
    print("Clustering contacts by title.... \n")
    clustered_contacts = {}
    for cluster in clusters:
        key = tuple(cluster)
        clustered_contacts[key] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[key].append(
                        '%s %s' % (contact['First Name'],
                                   contact['Last Name']))
    return clustered_contacts