示例#1
0
def init(topic_file):
    """
    Parse a topic file and return the corpus and its documents.

    Key arguments:
    topic_file -- path to the topic file; each non-blank line names a
                  topic whose documents live in '<topic>.txt' in the
                  same directory.

    Returns:
    (bible, docs) -- bible is a dotdict carrying 'topics', 'words' and
                     'docs'; docs is the flat list of all documents.
    """
    bible = dotdict()
    bible.topics = {}
    bible.words = dotdict()
    docs = []

    # 'base_dir' instead of 'dir' to avoid shadowing the builtin.
    base_dir = os.path.dirname(topic_file)

    with open(topic_file, 'r') as topic_handle:
        for topic in topic_handle:
            if topic == "\n":
                continue

            # Drop the trailing newline.
            topic = topic[:-1]

            # Read in each topic file; 'with' guarantees it is closed.
            docs_file = base_dir + '/' + topic + '.txt'
            with open(docs_file, 'r') as docs_handle:
                text = docs_handle.read()

            # Add a new topic to the bible.
            topic = topic.upper()
            bible.topics[topic] = []

            # Split the file based on the document id and its text.
            # BUG FIX: the flags were previously passed positionally,
            # landing in re.split's 'maxsplit' parameter (re.I|re.M ==
            # 10) and silently capping the number of documents parsed.
            split = re.split(r"(--\w+--)", text, flags=re.I | re.M)[1:]

            # 'split' alternates (--id--, body); walk it pairwise.
            for i in range(0, len(split), 2):
                doc_id = split[i][2:-2]  # strip the '--' delimiters
                words = re.findall(r'\w+', split[i + 1].upper())
                doc = Doc.factory(topic, doc_id, words, bible)
                docs.append(doc)
                bible.topics[topic].append(doc)

    # Run through each document and init the TFIDF vector.
    for doc in docs:
        doc.init(bible)

    # Set the docs in the bible
    bible.docs = docs

    return bible, docs
示例#2
0
def init(topic_file):
    """
    Parse a topic file and return the corpus and its documents.

    Key arguments:
    topic_file -- path to the topic file; each non-blank line names a
                  topic whose documents live in '<topic>.txt' in the
                  same directory.

    Returns:
    (bible, docs) -- bible is a dotdict carrying 'topics', 'words' and
                     'docs'; docs is the flat list of all documents.
    """
    bible = dotdict()
    bible.topics = {}
    bible.words = dotdict()
    docs = []

    # 'base_dir' instead of 'dir' to avoid shadowing the builtin.
    base_dir = os.path.dirname(topic_file)

    with open(topic_file, 'r') as topic_handle:
        for topic in topic_handle:
            if topic == "\n":
                continue

            # Drop the trailing newline.
            topic = topic[:-1]

            # Read in each topic file; 'with' guarantees it is closed.
            docs_file = base_dir + '/' + topic + '.txt'
            with open(docs_file, 'r') as docs_handle:
                text = docs_handle.read()

            # Add a new topic to the bible.
            topic = topic.upper()
            bible.topics[topic] = []

            # Split the file based on the document id and its text.
            # BUG FIX: the flags were previously passed positionally,
            # landing in re.split's 'maxsplit' parameter (re.I|re.M ==
            # 10) and silently capping the number of documents parsed.
            split = re.split(r"(--\w+--)", text, flags=re.I | re.M)[1:]

            # 'split' alternates (--id--, body); walk it pairwise.
            for i in range(0, len(split), 2):
                doc_id = split[i][2:-2]  # strip the '--' delimiters
                words = re.findall(r'\w+', split[i + 1].upper())
                doc = Doc.factory(topic, doc_id, words, bible)
                docs.append(doc)
                bible.topics[topic].append(doc)

    # Run through each document and init the TFIDF vector.
    for doc in docs:
        doc.init(bible)

    # Set the docs in the bible
    bible.docs = docs

    return bible, docs
示例#3
0
    def execute(self, docs):
        """
        Bisecting k-means: repeatedly split a selected cluster in two
        until exactly self.k clusters exist.

        NOTE(review): this implementation clusters self.bible.docs and
        ignores the 'docs' parameter — confirm against callers.

        @see parent
        """
        clusters = []

        # First cluster contains all of the docs.
        cluster = dotdict()
        cluster.docs = []

        for doc in self.bible.docs:
            doc.cluster = cluster
            cluster.docs.append(doc)

        cluster.centroid = self.centroid(cluster.docs)

        # Append the first cluster to the cluster list.
        clusters.append(cluster)

        # Each pass removes one cluster and adds two, so the count
        # grows by one per iteration until it reaches k.
        while len(clusters) != self.k:
            # Use the abstract select cluster method.
            cluster = self.select_cluster(clusters)

            # Remove this cluster from the current set because it will be split.
            clusters.remove(cluster)

            max_sim = float("-inf")
            max_bicluster = None

            # Try several random bisections and keep the best one.
            for i in range(self.iter):
                # Free the docs from whatever cluster they are in.
                for doc in cluster.docs:
                    doc.cluster = None

                kmeans = KMeans(2)
                kmeans.bible = self.bible

                kmeans.execute(cluster.docs)
                bicluster = kmeans.clusters

                sim = kmeans.similarity()
                if sim > max_sim:
                    max_sim = sim
                    max_bicluster = bicluster

            # BUG FIX: re-assign the documents to the *best* bisection
            # (max_bicluster), not 'bicluster' from the last iteration,
            # which left max_bicluster computed but unused.
            for sub_cluster in max_bicluster:
                for doc in sub_cluster.docs:
                    doc.cluster = sub_cluster

            # BUG FIX: likewise extend with the best bisection.
            clusters.extend(max_bicluster)

        self.clusters = clusters
示例#4
0
 def execute(self, docs):
     """
     Bisecting k-means: repeatedly split a selected cluster in two
     until exactly self.k clusters exist.

     NOTE(review): this implementation clusters self.bible.docs and
     ignores the 'docs' parameter — confirm against callers.

     @see parent
     """
     clusters = []

     # First cluster contains all of the docs.
     cluster = dotdict()
     cluster.docs = []

     for doc in self.bible.docs:
         doc.cluster = cluster
         cluster.docs.append(doc)

     cluster.centroid = self.centroid(cluster.docs)

     # Append the first cluster to the cluster list.
     clusters.append(cluster)

     # Each pass removes one cluster and adds two, so the count
     # grows by one per iteration until it reaches k.
     while len(clusters) != self.k:
         # Use the abstract select cluster method.
         cluster = self.select_cluster(clusters)

         # Remove this cluster from the current set because it will be split.
         clusters.remove(cluster)

         max_sim = float("-inf")
         max_bicluster = None

         # Try several random bisections and keep the best one.
         for i in range(self.iter):
             # Free the docs from whatever cluster they are in.
             for doc in cluster.docs:
                 doc.cluster = None

             kmeans = KMeans(2)
             kmeans.bible = self.bible

             kmeans.execute(cluster.docs)
             bicluster = kmeans.clusters

             sim = kmeans.similarity()
             if sim > max_sim:
                 max_sim = sim
                 max_bicluster = bicluster

         # BUG FIX: re-assign the documents to the *best* bisection
         # (max_bicluster), not 'bicluster' from the last iteration,
         # which left max_bicluster computed but unused.
         for sub_cluster in max_bicluster:
             for doc in sub_cluster.docs:
                 doc.cluster = sub_cluster

         # BUG FIX: likewise extend with the best bisection.
         clusters.extend(max_bicluster)

     self.clusters = clusters
示例#5
0
    def execute(self, docs):
        """
        Agglomerative clustering: start with one singleton cluster per
        document and merge until self.k clusters remain.

        @see parent
        """
        # To start, every document is its own cluster.
        clusters = []
        for doc in docs:
            cluster = dotdict()

            doc.cluster = cluster
            cluster.docs = [doc]
            # Copy the tfidf vector so later centroid updates do not
            # mutate the document's own vector.
            cluster.centroid = dict(doc.tfidf)
            clusters.append(cluster)

        # Continue merging until we reach k clusters.
        # ROBUSTNESS FIX: use '>' instead of '!=' — with fewer than k
        # documents, '!=' can never be satisfied and would loop forever.
        while len(clusters) > self.k:
            clusters = self.merge(clusters)

        self.clusters = clusters
示例#6
0
 def execute(self, docs):
     """
     Agglomerative clustering: start with one singleton cluster per
     document and merge until self.k clusters remain.

     @see parent
     """
     # To start, every document is its own cluster.
     clusters = []
     for doc in docs:
         cluster = dotdict()

         doc.cluster = cluster
         cluster.docs = [doc]
         # Copy the tfidf vector so later centroid updates do not
         # mutate the document's own vector.
         cluster.centroid = dict(doc.tfidf)
         clusters.append(cluster)

     # Continue merging until we reach k clusters.
     # ROBUSTNESS FIX: use '>' instead of '!=' — with fewer than k
     # documents, '!=' can never be satisfied and would loop forever.
     while len(clusters) > self.k:
         clusters = self.merge(clusters)

     self.clusters = clusters