Example #1
0
 def prune(self, prune_map, show_progress=True):
     ''' Remove terms (columns) from the matrix according to prune_map.

         prune_map: list of 0's and 1's, same length as self.terms.
                    Terms flagged 1 are kept; terms flagged 0 are removed.
         show_progress: when True, print progress messages.
         Returns False (doing nothing) if prune_map is empty or its length
         does not match self.terms; returns None on success.

         NOTE(review): both loops walk prune_map backwards, so the surviving
         vocabulary and every document row end up in reversed order. They
         stay aligned with each other, but overall term order is reversed
         relative to the original -- confirm no caller relies on order.
     '''
     if not prune_map or len(prune_map) != len(self.terms):
         return False
     if show_progress:
         # Parenthesized print works on both Python 2 and 3;
         # the statement form was Python-2-only.
         print('  Pruning terms list ...')
     new_terms = SuperList()
     # Walk backwards, keeping only terms whose flag is 1.
     for i in range(len(prune_map) - 1, -1, -1):
         if prune_map[i] == 1:
             new_terms.append(self.terms[i])
     self.terms = new_terms
     if show_progress:
         print('  Pruning documents ...')
     p = Progress(n=len(self), percent=10)
     for doc in self.docs:
         new_doc_terms = SuperList()
         # Same backward walk keeps each row aligned with self.terms.
         for i in range(len(prune_map) - 1, -1, -1):
             if prune_map[i] == 1:
                 new_doc_terms.append(doc['terms'][i])
         doc['terms'] = new_doc_terms
         if show_progress:
             p.show(message='  Pruning progress:')
Example #2
0
File: matrix.py  Project: vatsrahul/irlib
    def add_doc(self,
                doc_id='',
                doc_class='',
                doc_terms=None,
                frequency=False,
                do_padding=False,
                unique_ids=False,
                meta_data=None):
        ''' Add new document to our matrix:
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       Terms can be tuples; string and frequencies.
            frequency: If true, term occurences is incremented by one.
                        Else, occurences is only 0 or 1 (a la Bernoulli).
            do_padding: Boolean. Check do_padding() for more info.
            unique_ids: When true, if two documents are added with same id,
                        then their terms are summed up into only one record.
            meta_data: More fields to add to the document, for your own use.
            Raises ValueError when doc_terms is empty/None.
        '''
        # Guard against the mutable-default-argument pitfall: the old
        # defaults ([] and {}) were shared between every call.
        if doc_terms is None:
            doc_terms = []
        if meta_data is None:
            meta_data = {}
        if not doc_terms:
            raise ValueError('doc_terms cannot be empty')
        # Update list of terms if new term seen.
        # And document (row) with its associated data.
        my_doc_terms = SuperList()
        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms if t in self.whitelist]
        # Discard anything in stopwords if not empty
        if self.blacklist:
            doc_terms = [t for t in doc_terms if t not in self.blacklist]
        for term in doc_terms:
            # isinstance is the idiomatic (and subclass-safe) type check.
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)
        # In the rare event when whitelisting causes an empty doc_terms list
        # we add at least one zero in the list of my_doc_terms.
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        doc_data = {'id': doc_id, 'class': doc_class, 'terms': my_doc_terms}

        for key in meta_data:
            doc_data[key] = meta_data[key]

        if unique_ids:
            self.docs.add_unique(doc_data)
        else:
            self.docs.append(doc_data)

        if do_padding:
            self.do_padding()
Example #3
0
File: matrix.py  Project: vatsrahul/irlib
 def load(self, filename, delimiter='\t', header=True):
     ''' Load the matrix from a delimited (CSV/TSV) file.

         filename: path of the file to read.
         delimiter: field separator (tab by default).
         header: when True, the first line is a header whose columns
                 (after the id and class columns) become self.terms.
     '''
     with open(filename, 'r') as fd:
         if header:
             header_data = fd.readline().strip().split(delimiter)
             # First 2 columns are id and class
             self.terms = SuperList(header_data[2:])
         for line in fd:
             # BUGFIX: split each row into fields. The old code indexed
             # the raw string (line[0], line[1], line[2:]), which took
             # single characters instead of delimited columns.
             fields = line.strip().split(delimiter)
             doc_data = {
                 'id': fields[0],
                 'class': fields[1],
                 'terms': SuperList(fields[2:])
             }
             self.docs.append(doc_data)
     # No explicit fd.close(): the with-statement already closed it.
Example #4
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=None,
             frequency=False, do_padding=False):
     ''' Add new document to our matrix:
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
         frequency: If true, term occurences is incremented by one.
                     Else, occurences is only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Check do_padding() for more info.
     '''
     # Avoid the shared mutable default argument ([]).
     if doc_terms is None:
         doc_terms = []
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     for term in doc_terms:
         term_idx = self.terms.unique_append(term)
         if frequency:
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             my_doc_terms.insert_after_padding(term_idx, 1)
     self.docs.append({'id': doc_id,
                       'class': doc_class,
                       'terms': my_doc_terms})
     # Update per-class terms summary. BUGFIX: dict.has_key() was
     # removed in Python 3; the `in` operator works on both versions.
     if doc_class in self.classes:
         self.classes[doc_class].add(my_doc_terms)
     else:
         self.classes[doc_class] = my_doc_terms
     if do_padding:
         self.do_padding()
Example #5
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=None,
             frequency=False, do_padding=False):
     ''' Add new document to our matrix:
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be tuples; string and frequencies.
         frequency: If true, term occurences is incremented by one.
                     Else, occurences is only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Check do_padding() for more info.
     '''
     # Avoid the shared mutable default argument ([]).
     if doc_terms is None:
         doc_terms = []
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if t in self.whitelist]
     for term in doc_terms:
         # isinstance is the idiomatic (and subclass-safe) type check.
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx, 1)
             else:
                 my_doc_terms.insert_after_padding(term_idx, 1)
     # In the rare event when whitelisting causes an empty doc_terms list
     # we add at least one zero in the list of my_doc_terms.
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)
     self.docs.append({'id': doc_id,
                       'class': doc_class,
                       'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
Example #6
0
 def tf_idf(self, do_idf=True):
     ''' Convert the matrix cells to tf.idf weights, in place.
         do_idf: if False, apply the log-tf transform only.
     '''
     N = len(self.docs)
     num_terms = len(self.terms)
     # Document frequency: in how many documents each term occurs.
     df = [0] * num_terms
     for doc in self.docs:
         terms = doc['terms']
         for idx in range(num_terms):
             if terms[idx] > 0:
                 df[idx] += 1

     for doc in self.docs:
         terms = doc['terms']
         for idx in range(num_terms):
             tf = self._log_tf(terms[idx])
             if do_idf:
                 # Guard: df[idx] == 0 would raise ZeroDivisionError.
                 # A term absent from every document gets weight 0.
                 idf = math.log10(float(N) / df[idx]) if df[idx] else 0.0
                 terms[idx] = tf * idf
             else:
                 # Skip computing idf entirely when it is not used.
                 terms[idx] = tf
Example #7
0
 def __init__(self, matrix):
     ''' Build per-class term statistics from a matrix object.
         matrix: object exposing .classes, a mapping from class label to
                 that class's aggregated term-frequency list.
     '''
     self.mx = matrix
     self.N = 0
     self.classes = {}
     self.terms = SuperList()
     for label in self.mx.classes:
         class_terms = self.mx.classes[label]
         total = sum(class_terms)
         self.classes[label] = {'terms': class_terms, 'total': total}
         self.terms.add(class_terms)
         self.N += total
     self.mi_terms = []
Example #8
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=None,
             frequency=False, do_padding=False, stopwords=None):
     ''' Add new document to our matrix:
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    Terms can be tuples; string and frequencies.
         frequency: If true, term occurences is incremented by one.
                     Else, occurences is only 0 or 1 (a la Bernoulli).
         do_padding: Boolean. Useless here.
         stopwords: If not empty, ignore those stop words in doc_terms.
     '''
     # Avoid shared mutable default arguments ([] for both params).
     if doc_terms is None:
         doc_terms = []
     # Update list of terms if new term seen.
     # And document (row) with its associated data.
     my_doc_terms = SuperList()
     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if t in self.whitelist]
     # Discard anything in stopwords if not empty
     if stopwords:
         doc_terms = [t for t in doc_terms if t not in stopwords]
     for term in doc_terms:
         # isinstance is the idiomatic (and subclass-safe) type check.
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx, 1)
             else:
                 my_doc_terms.insert_after_padding(term_idx, 1)
     # Merge this document's terms into every existing row of the same
     # class; only create a new row when the class was not seen before.
     found = False
     for doc in self.docs:
         if doc['class'] == doc_class:
             doc['terms'].add(my_doc_terms)
             found = True
     if not found:
         self.docs.append({'id': doc_id,
                           'class': doc_class,
                           'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
Example #9
0
 def __init__(self, whitelist=None):
     ''' Initilize our matrix.
         whitelist: If not empty, discard any terms not in whitelist,
                    when adding new terms via add_doc()
         terms: We will populate this with our vocabulary of terms
         docs: This is our actual 2D matrix terms/docs.
               A list of the following dictionary,
               { 'id': Unique ID to each document,
                 'class': In case of labeled data, doc class label,
                 'terms': list of 1's and 0's, i.e. term Frequencies.
               }
     '''
     # List of unique terms (vocabulary)
     self.terms = SuperList()
     self.docs = []
     # BUGFIX: the old default (whitelist=[]) was a single mutable list
     # shared by every instance created without an explicit whitelist.
     self.whitelist = whitelist if whitelist is not None else []
Example #10
0
 def query_to_vector(self, q_terms, frequency=False):
     ''' Converts query to a list alligned with our self.terms.
         Terms not seen before will be ignored.
         q_terms: list of query terms
         frequency: return a multinomial or multivariate list?
     '''
     my_query_vector = SuperList()
     my_query_vector.expand(new_len=len(self.terms))
     for term in q_terms:
         try:
             term_idx = self.terms.index(term)
         except ValueError:
             # Term not seen before, skip.  A bare `except:` here would
             # also hide unrelated errors (KeyboardInterrupt, typos);
             # list.index raises ValueError for missing items --
             # NOTE(review): assumes SuperList keeps list semantics.
             continue
         if frequency:
             my_query_vector.increment_after_padding(term_idx, 1)
         else:
             my_query_vector.insert_after_padding(term_idx, 1)
     return my_query_vector
Example #11
0
 def to_be_deleted__getitem__(self, term):
     ''' Return the column of occurrences of `term` across all documents.
         An unknown term yields an empty SuperList.
     '''
     if term not in self:
         return SuperList()
     # Look the term's index up once, then read that column from each row.
     idx = self.terms.index(term)
     column = [doc['terms'][idx] for doc in self.docs]
     return SuperList(column)
Example #12
0
 def __init__(self):
     ''' Initialize an empty matrix: vocabulary, class summaries, docs. '''
     # Vocabulary: list of unique terms seen so far.
     self.terms = SuperList()
     # Per-class aggregated term summaries.
     self.classes = {}
     # The 2D matrix itself: one dict (id/class/terms) per document row.
     self.docs = []