def testClusterLen0(self):
     """
     Clustering an empty input list must produce an empty result.
     """
     clusterer = HierarchicalClustering([], lambda x, y: abs(x - y))
     self.assertEqual([], clusterer.getlevel(40))
 def testMultiprocessing(self):
     cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y),
                                 num_processes=4)
     new_data = []
     [new_data.extend(_) for _ in cl.getlevel(40)]
     self.assertEqual(sorted(new_data), sorted(self.__data))
 def testUnmodifiedData(self):
     cl = HierarchicalClustering(self.__data, self.sim)
     new_data = []
     [new_data.extend(_) for _ in cl.getlevel(0.5)]
     self.assertEqual(sorted(new_data), sorted(self.__data))
 def testDataTypes(self):
     "Test for bug #?"
     cl = HierarchicalClustering(self.__data, self.sim)
     for item in cl.getlevel(0.5):
         self.assertEqual(type(item), type([]),
                          "Every item should be a list!")
def testUnmodifiedData(self):
     # NOTE(review): this redefines testUnmodifiedData (also defined earlier
     # with self.sim as metric); inside one class the later definition
     # shadows the earlier one, which then never runs -- consider renaming
     # one of the two (e.g. testUnmodifiedDataAbs).
     cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))
     new_data = []
     # Flatten all clusters at level 40 into new_data.
     [new_data.extend(_) for _ in cl.getlevel(40)]
     # The clustered items, flattened, must equal the original input.
     self.assertEqual(sorted(new_data), sorted(self.__data))
Exemplo n.º 6
0
def find_symelems(sele_or_xforms="all", verbose=False):
    xforms = sele_or_xforms
    if isinstance(sele_or_xforms, basestring):
        xforms, maxrms = get_xforms_by_chain(sele_or_xforms, verbose=True)
    elif not isinstance(sele_or_xforms, dict):
        raise ValueError
    symelems = list()
    maxangerr = 0.0
    for c, x in xforms.items():
        assert len(c) == 2
        assert isinstance(x, Xform)
        if c[0] == c[1]:
            continue
        dis = x.t.length()
        if dis > 5.0:
            continue
        axis, ang = x.rotation_axis()
        nfold = round(math.pi * 2.0 / ang)
        angerr = abs(ang - math.pi * 2.0 / nfold) * 180.0 / math.pi
        if verbose:
            print "candidate symelem:", nfold, c, angerr, axis
        if angerr > 360.0 / nfold / 8.0:
            continue  # require unambiguous symelems
        maxangerr = max(maxangerr, angerr * nfold)
        symelems.append((nfold, axis, c, angerr))

    def symelemdis(x, y): return line_line_angle_degrees(
        x[1], y[1]) if x[0] == y[0] else 9e9
    if verbose:
        for se1, se2 in filter(lambda t: t[0] < t[1], product(symelems, symelems)):
            if se1[0] == se2[0]:
                print se1
                print se2
                print symelemdis(se1, se2), "degrees"
                print
    hier = HierarchicalClustering(symelems, symelemdis)
    thresh = 6.0
    clusters = hier.getlevel(thresh)
    print "number of symmetry element clusters at threshold", thresh, "degrees is", len(clusters)
    centers0 = list()
    maxaxiserr = 0.0
    for clust in clusters:
        print "symelem cluster:", clust
        center = list(clust[0])
        center[2] = list((center[2],))
        for i in range(1, len(clust)):
            ax = clust[i][1]
            center[1] = center[1] + (ax if ax.dot(center[1]) > 0 else -ax)
            center[2].append(clust[i][2])
            center[3] = max(center[3], clust[i][3])
        center[1].normalize()
        centers0.append(center)
        axiserr = 0.0
        for c in clust:
            axiserr = max(axiserr, 1.0 - abs(center[1].dot(c[1])))
        maxaxiserr = max(maxaxiserr, axiserr)
    # sort on nfold, then on number of chain pairs in cluster
    centers0 = sorted(centers0, cmp=lambda x, y: cmp(
        y[0], x[0]) if x[0] != y[0] else cmp(len(y[2]), len(x[2])))
    centers = list()
    for center in centers0:
        if verbose:
            print "DEBUG prune center:", center
        seenit = False
        for censeen in centers:
            remainder = abs((censeen[0] / center[0]) % 1.0)
            if verbose:
                print "   ", remainder, censeen
            if remainder > 0.01:
                continue  # not a symmetry multiple
            if 1.0 - abs(center[1].dot(censeen[1])) < 0.01:
                seenit = True  # axis are same
        if not seenit:
            centers.append(center)
    print "centers:"
    cen_of_geom = com("(" + sele_or_xforms + ") and (name CA and not HET)")
    for center in centers:
        print center
        # if center[0]>2.1: continue
        # showvecfrompoint(50*center[1],cen_of_geom)
    return centers, maxrms, maxangerr, maxaxiserr
    def _align_features_cluster(self, m, rt_diff_cutoff, fdr_cutoff,
                                aligned_fdr_cutoff, method):
        """ Align features by clustering all peakgroups 

        This algorithm will find the best peakgroup cluster over all runs and
        then select all peakgroups belonging to the cluster.

        It does not treat heavy/light specially (they are treated like two independent runs).
        """

        verb = self.verbose

        if verb:
            print "00000000000000000000000000000000000 new peptide (cluster)", m.getAllPeptides(
            )[0].get_id()

        # i) get all RTs above the cutoff
        for p in m.getAllPeptides():  # loop over all peptides
            pg = p.get_best_peakgroup()
            if verb:
                print "best rt", pg.get_normalized_retentiontime(
                ), pg.peptide.run.get_id(), pg.get_fdr_score()

        groups = [
            pg for p in m.getAllPeptides()  # loop over all peptides
            for pg in p.get_all_peakgroups()  # loop over all peakgroups
            if pg.get_fdr_score() < aligned_fdr_cutoff
        ]

        # Check for empty groups
        if len(groups) == 0:
            return

        # do the clustering
        from cluster import HierarchicalClustering
        cl = HierarchicalClustering(
            groups, lambda x, y: abs(x.get_normalized_retentiontime() - y.
                                     get_normalized_retentiontime()))
        clusters_rt = cl.getlevel(
            rt_diff_cutoff)  # for large clusters, this is the the bottleneck!
        clusters_rt_obj = [Cluster(c) for c in clusters_rt]
        # if there was only one group, we need to prepare a special object of size one
        if len(groups) == 1: clusters_rt_obj = [Cluster(groups)]

        if verb: print "==== Clusters "
        # make sure only one is selected from each run...
        for i, c in enumerate(clusters_rt_obj):
            c.select_one_per_run(self.verbose)
            if verb:
                print " - Cluster with score", c.getTotalScore(), "at", \
                  c.getMedianRT(), "+/-", c.getRTstd() , "(norm_score %s)" %\
                  (float(c.getTotalScore())/((aligned_fdr_cutoff/2)**len(c.peakgroups)))
                for pg in c.peakgroups:
                    print "   = Have member", pg.print_out()

        # Get best cluster by length-normalized best score.
        #   Length normalization divides the score by the expected probability
        #   values if all peakgroups were chosen randomly (assuming equal
        #   probability between 0 and aligned_fdr_cutoff, the expected value
        #   for a random peakgroup is "aligned_fdr_cutoff/2") and thus the
        #   expected random value of n peakgroups would be (aligned_fdr_cutoff/2)^n
        bestcluster = min(clusters_rt_obj,
                          key=(lambda x: x.getTotalScore() / ((
                              (aligned_fdr_cutoff / 2)**len(c.peakgroups)))))

        clusters_rt_obj.sort(lambda x, y: cmp(
            x.getTotalScore() / ((aligned_fdr_cutoff / 2)**len(x.peakgroups)),
            y.getTotalScore() / ((aligned_fdr_cutoff / 2)**len(y.peakgroups))))
        for i, c in enumerate(clusters_rt_obj):
            for pg in c.peakgroups:
                pg.setClusterID(i + 1)
Exemplo n.º 8
0
def cluster_contacts_by_title(csv_file):
    """Group contacts from a CSV export by similarity of their job titles.

    Titles are normalized (common abbreviations expanded, compound titles
    split on '/', 'and', '&'), clustered hierarchically using the DISTANCE
    scoring function at DISTANCE_THRESHOLD, and contacts whose titles fall
    in the same non-singleton cluster are grouped together.

    Returns {tuple_of_clustered_titles: ["First Last", ...]}.
    """

    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]

    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up list of common titles

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        # NOTE(review): titles is extended while being iterated, so split
        # fragments are themselves re-checked for the remaining separators;
        # the composite title is deliberately kept (remove() is commented
        # out). Also, find(separator) matches substrings, so e.g. 'Brand'
        # matches the separator 'and' -- confirm this is acceptable.
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    #titles.remove(title)
                    titles.extend([
                        title.strip() for title in title.split(separator)
                        if title.strip() != ''
                    ])

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together

    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s' %
                        (contact['First Name'], contact['Last Name']))

    return clustered_contacts
    # BUGFIX: removed an unreachable "all_titles.extend(titles)" that
    # followed this return statement (dead code).

# NOTE(review): the following top-level statements duplicate the body of
# cluster_contacts_by_title above; the names they use (all_titles, titles,
# DISTANCE, DISTANCE_THRESHOLD, HierarchicalClustering, contacts) must be
# defined earlier in the full script for this section to run.
all_titles = list(set(all_titles))

######## Begin: HAC ########

# Define a scoring function


def score(title1, title2):
    # Word-set distance between two titles, delegating to DISTANCE.
    return DISTANCE(set(title1.split()), set(title2.split()))


# Feed the class your data and the scoring function

hc = HierarchicalClustering(all_titles, score)

# Cluster the data according to a distance threshold

clusters = hc.getlevel(DISTANCE_THRESHOLD)

# Remove singleton clusters
# clusters = [c for c in clusters if len(c) > 1]

######## End: HAC ########

# Round up contacts who are in these clusters and group them together

clustered_contacts = {}
for cluster in clusters:
    # One entry per cluster, keyed by the (hashable) tuple of its titles.
    clustered_contacts[tuple(cluster)] = []
def run(data):
    """Basic hierarchical-clustering smoke test with string items."""
    clusterer = HierarchicalClustering(data, sim)
    print(clusterer.getlevel(0.5))
Exemplo n.º 11
0
    'http://view.inews.qq.com/a/20170613A03NY600',
    'http://view.inews.qq.com/a/20170612A065L900',
    'http://view.inews.qq.com/a/20170611A03T5V00',
    'http://news_and_blog.qq.com/zt2017/thirteen_in/allpc.htm',
    'http://media.news_and_blog.qq.com/mediaplus/home.htm',
    'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
    'http://weather.news_and_blog.qq.com/',
    'http://news_and_blog.qq.com/newssh/shwx/shehuiwanxiang.htm',
    'http://news_and_blog.qq.com/dc_article2016/tagsList.htm?tags=%E5%81%87%E5%A9%9A%E5%A7%BB',
    'http://news_and_blog.qq.com/',
    'http://news_and_blog.qq.com/',
    'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
    'http://news_and_blog.qq.com/dc_article2016/tagsList.htm?tags=%E9%82%BB%E5%B1%85',
    'http://news_and_blog.qq.com/',
    'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
    'http://news_and_blog.qq.com/original/tuhua/spacedelivery.html',
    'http://news_and_blog.qq.com/photon/photostory/eyhz.htm',
]


# Distance between two URLs: 1 minus the similarity ratio
# computed by difflib's SequenceMatcher.
def distance(url1, url2):
    similarity = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - similarity


# Perform hierarchical clustering of the URLs using the distance function
# above, then pretty-print the clusters found at distance threshold 0.2.
hc = HierarchicalClustering(urls, distance)
clusters = hc.getlevel(0.2)
pprint.pprint(clusters)
Exemplo n.º 12
0
              'tweets': 'no',
              'phd': 'yes'
          }, False)]
# Build an ID3 decision tree from the training inputs defined above,
# print it, then classify one unseen sample.
dt = DecisionTree()
tree = dt.build_tree_id3(inputs)
print(tree)

# Unseen sample to classify; keys match the training attributes.
new_input = {'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}
label = dt.classify(tree, new_input)
print(label)

# 2-D sample points for bottom-up (agglomerative) clustering.
inputs = [[19, 28], [21, 27], [20, 23], [28, 13], [11, 15], [13, 13], [-49, 0],
          [-46, 5], [-41, 8], [-49, 15], [-34, -1], [-22, -16], [-19, -11],
          [-25, -9], [-11, -6], [-12, -8], [-14, -5], [-18, -3], [-13, -19],
          [-9, -16]]
# NOTE(review): this HierarchicalClustering takes no constructor arguments,
# unlike the (data, distance) usage elsewhere in this file -- presumably a
# different class of the same name; confirm the import.
hcl = HierarchicalClustering()
base_cluster = hcl.bottom_up_cluster(inputs)
print(base_cluster)
# Cut the merge hierarchy into 3 clusters and print them.
clusters = hcl.generate_clusters(base_cluster, 3)
print(clusters)

# Build a small in-memory users table and populate it with sample rows
# of the form [user_id, name, num_friends].
users = Table(["user_id", "name", "num_friends"])
users.insert([0, "Hero", 0])
users.insert([1, "Dunn", 2])
users.insert([2, "Sue", 3])
users.insert([3, "Chi", 3])
users.insert([4, "Thor", 3])
users.insert([5, "Clive", 2])
users.insert([6, "Hicks", 3])
users.insert([7, "Devin", 2])
users.insert([8, "Kate", 2])