예제 #1
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=[],
             frequency=False, do_padding=False):
     ''' Add a new document (row) to the matrix.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: Label of the document; you might need this in
                    classification. Also used as the key in self.classes.
         doc_terms: List of terms you got after tokenizing the document.
                    It is only iterated here, never modified.
         frequency: If true, each occurrence of a term adds one to its cell.
                    Else, the cell is set to 1 no matter how often the term
                    occurs (a la Bernoulli).
         do_padding: If true, self.do_padding() is called once the document
                     has been stored. Check do_padding() for more info.
     '''
     # Build this document's row, registering terms in self.terms as we go.
     my_doc_terms = SuperList()
     for term in doc_terms:
         # The value returned by unique_append() is used as the cell index.
         term_idx = self.terms.unique_append(term)
         if frequency:
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             my_doc_terms.insert_after_padding(term_idx, 1)
     self.docs.append({'id': doc_id,
                       'class': doc_class,
                       'terms': my_doc_terms})
     # Fold the row into the entry kept for its class.
     # 'in' is used instead of has_key(), which Python 3 dicts do not have.
     if doc_class in self.classes:
         self.classes[doc_class].add(my_doc_terms)
     else:
         # NOTE(review): the class entry and the document stored above are
         # the same SuperList object. If SuperList.add() works in place,
         # later documents of this class will also alter that first
         # document's row - confirm, and store a copy here if so.
         self.classes[doc_class] = my_doc_terms
     if do_padding:
         self.do_padding()
예제 #2
0
파일: matrix.py 프로젝트: ljc0753/irlib
 def add_doc(self, doc_id='', doc_class='', doc_terms=[],
             frequency=False, do_padding=False):
     ''' Add a new document (row) to the matrix.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    An item can be a plain term, or a (term, value) tuple
                    whose value is added to the term's cell.
         frequency: Only affects plain terms. If true, each occurrence adds
                    one to the term's cell. Else, the cell is set to 1
                    (a la Bernoulli). Tuple values are always added.
         do_padding: If true, self.do_padding() is called once the document
                     has been stored. Check do_padding() for more info.
     '''
     # Build this document's row, registering terms in self.terms as we go.
     my_doc_terms = SuperList()
     for term in doc_terms:
         # isinstance() instead of type() == tuple, so that tuple
         # subclasses (e.g. namedtuples) are read as pairs too.
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
             continue
         term_idx = self.terms.unique_append(term)
         if frequency:
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             my_doc_terms.insert_after_padding(term_idx, 1)
     self.docs.append({'id': doc_id,
                       'class': doc_class,
                       'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
예제 #3
0
파일: matrix.py 프로젝트: vatsrahul/irlib
    def add_doc(self,
                doc_id='',
                doc_class='',
                doc_terms=[],
                frequency=False,
                do_padding=False,
                unique_ids=False,
                meta_data={}):
        ''' Add a new document (row) to the matrix.
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       An item can be a plain term, or a (term, frequency)
                       tuple. Must not be empty.
            frequency: Only affects plain terms. If true, each occurrence
                       adds one to the term's cell. Else, the cell is set
                       to 1 (a la Bernoulli). Tuple frequencies are always
                       added.
            do_padding: If true, self.do_padding() is called at the end.
                        Check do_padding() for more info.
            unique_ids: If true, the document is handed to
                        self.docs.add_unique() instead of append(), so that
                        documents sharing an id end up in one record.
            meta_data: More fields to add to the document, for your own use.
                       A key named 'id', 'class' or 'terms' overrides the
                       value set by this method.
            Raises ValueError when doc_terms is empty.
        '''
        if not doc_terms:
            raise ValueError('doc_terms cannot be empty')

        def term_of(item):
            # The lists hold terms, so a (term, frequency) pair has to be
            # looked up by its term. Comparing the whole tuple made every
            # pair fail the whitelist and slip past the blacklist.
            return item[0] if isinstance(item, tuple) else item

        # Discard anything not in whitelist if it is not empty
        if self.whitelist:
            doc_terms = [t for t in doc_terms
                         if term_of(t) in self.whitelist]
        # Discard anything in blacklist if it is not empty
        if self.blacklist:
            doc_terms = [t for t in doc_terms
                         if term_of(t) not in self.blacklist]

        # Build this document's row, registering terms in self.terms.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
                continue
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)

        # Filtering may have left nothing; fall back to a row of zeros,
        # one for each entry of self.vocabulary().
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        doc_data = {'id': doc_id, 'class': doc_class, 'terms': my_doc_terms}

        for key in meta_data:
            doc_data[key] = meta_data[key]

        if unique_ids:
            self.docs.add_unique(doc_data)
        else:
            self.docs.append(doc_data)

        if do_padding:
            self.do_padding()
예제 #4
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=[],
             frequency=False, do_padding=False, stopwords=[]):
     ''' Add a document to the matrix, keeping one row per class.
         If a stored document already has the same doc_class, the new
         terms are added to that row and doc_id is not stored. Otherwise
         a new row is appended.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: Label of the document; rows are merged on it.
         doc_terms: List of terms you got after tokenizing the document.
                    An item can be a plain term, or a (term, frequency)
                    tuple.
         frequency: Only affects plain terms. If true, each occurrence adds
                    one to the term's cell. Else, the cell is set to 1
                    (a la Bernoulli). Tuple frequencies are always added.
         do_padding: If true, self.do_padding() is called at the end.
         stopwords: If not empty, ignore those stop words in doc_terms.
                    It is only read here, never modified.
     '''
     # Discard anything not in whitelist if it is not empty.
     # (term, frequency) pairs are matched on their term: comparing the
     # whole tuple against a list of terms could never match.
     if self.whitelist:
         doc_terms = [t for t in doc_terms
                      if (t[0] if isinstance(t, tuple) else t)
                      in self.whitelist]
     # Discard anything in stopwords if not empty
     if stopwords:
         doc_terms = [t for t in doc_terms
                      if (t[0] if isinstance(t, tuple) else t)
                      not in stopwords]

     # Build this document's row, registering terms in self.terms.
     my_doc_terms = SuperList()
     for term in doc_terms:
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
         else:
             term_idx = self.terms.unique_append(term)
             if frequency:
                 my_doc_terms.increment_after_padding(term_idx, 1)
             else:
                 my_doc_terms.insert_after_padding(term_idx, 1)

     # Merge into every stored row of the same class; no early exit,
     # so all matching rows receive the terms.
     merged = False
     for doc in self.docs:
         if doc['class'] == doc_class:
             doc['terms'].add(my_doc_terms)
             merged = True
     if not merged:
         self.docs.append({'id': doc_id,
                           'class': doc_class,
                           'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
예제 #5
0
 def add_doc(self, doc_id='', doc_class='', doc_terms=[],
             frequency=False, do_padding=False):
     ''' Add a new document (row) to the matrix.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    An item can be a plain term, or a (term, frequency)
                    tuple.
         frequency: Only affects plain terms. If true, each occurrence adds
                    one to the term's cell. Else, the cell is set to 1
                    (a la Bernoulli). Tuple frequencies are always added.
         do_padding: If true, self.do_padding() is called once the document
                     has been stored. Check do_padding() for more info.
     '''
     # Discard anything not in whitelist if it is not empty.
     if self.whitelist:
         kept = []
         for item in doc_terms:
             # A (term, frequency) pair is looked up by its term; the
             # tuple itself would never be found in a list of terms.
             name = item[0] if isinstance(item, tuple) else item
             if name in self.whitelist:
                 kept.append(item)
         doc_terms = kept

     # Build this document's row, registering terms in self.terms.
     my_doc_terms = SuperList()
     for term in doc_terms:
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
             continue
         term_idx = self.terms.unique_append(term)
         if frequency:
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             my_doc_terms.insert_after_padding(term_idx, 1)

     # Whitelisting (or an empty doc_terms) may leave nothing; fall back
     # to a row of zeros, one for each entry of self.vocabulary().
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)
     self.docs.append({'id': doc_id,
                       'class': doc_class,
                       'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
예제 #6
0
파일: matrix.py 프로젝트: ljc0753/irlib
 def query_to_vector(self, q_terms, frequency=False):
     ''' Convert a query to a list aligned with self.terms.
         Terms that cannot be found in self.terms are ignored.
         q_terms: list of query terms
         frequency: If true, each occurrence of a term adds one to its
                    cell (multinomial). Else, the cell is set to 1
                    (multivariate).
         Returns the resulting SuperList.
     '''
     my_query_vector = SuperList()
     # Size the vector to the current number of known terms up front.
     my_query_vector.expand(new_len=len(self.terms))
     for term in q_terms:
         try:
             term_idx = self.terms.index(term)
         except Exception:
             # Term not seen before, skip.
             # Exception rather than a bare except, so KeyboardInterrupt
             # and SystemExit are no longer swallowed by this loop.
             continue
         if frequency:
             my_query_vector.increment_after_padding(term_idx, 1)
         else:
             my_query_vector.insert_after_padding(term_idx, 1)
     return my_query_vector
예제 #7
0
 def query_to_vector(self, q_terms, frequency=False):
     ''' Turn the terms of a query into a list aligned with self.terms.
         Query terms for which the lookup in self.terms fails are skipped.
         q_terms: list of query terms
         frequency: If true, return counts: every occurrence of a term
                    adds one to its cell. Else, return a 0/1 style list:
                    the cell of a seen term is set to 1.
         Returns the resulting SuperList.
     '''
     my_query_vector = SuperList()
     # Start from a vector as long as the list of known terms.
     my_query_vector.expand(new_len=len(self.terms))
     # Choose the cell update once; it does not depend on the term.
     if frequency:
         put = my_query_vector.increment_after_padding
     else:
         put = my_query_vector.insert_after_padding
     for term in q_terms:
         try:
             term_idx = self.terms.index(term)
         except Exception:
             # Unknown term: leave the vector untouched for it.
             # A bare except here used to trap KeyboardInterrupt and
             # SystemExit as well; Exception lets those propagate.
             continue
         put(term_idx, 1)
     return my_query_vector
예제 #8
0
 def add_doc(self, doc_id="", doc_class="", doc_terms=[], frequency=False, do_padding=False):
     """ Add a new document (row) to the matrix.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    An item can be a plain term, or a (term, frequency)
                    tuple.
         frequency: Only affects plain terms. If true, each occurrence adds
                    one to the term's cell. Else, the cell is set to 1
                    (a la Bernoulli). Tuple frequencies are always added.
         do_padding: If true, self.do_padding() is called once the document
                     has been stored. Check do_padding() for more info.
     """

     def whitelisted(item):
         # Whitelist entries are terms, so a (term, frequency) pair is
         # checked by its term; the whole tuple would never be found.
         name = item[0] if isinstance(item, tuple) else item
         return name in self.whitelist

     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms if whitelisted(t)]

     # Build this document's row, registering terms in self.terms.
     my_doc_terms = SuperList()
     for term in doc_terms:
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
         elif frequency:
             term_idx = self.terms.unique_append(term)
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             term_idx = self.terms.unique_append(term)
             my_doc_terms.insert_after_padding(term_idx, 1)

     # Whitelisting (or an empty doc_terms) may leave nothing; fall back
     # to a row of zeros, one for each entry of self.vocabulary().
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)
     self.docs.append({"id": doc_id, "class": doc_class, "terms": my_doc_terms})
     if do_padding:
         self.do_padding()
예제 #9
0
파일: matrix.py 프로젝트: KemingChen/IR
 def add_doc(self, doc_id='', doc_class='', doc_terms=[],
             frequency=False, do_padding=False,
             unique_ids=False, stopwords=[]):
     ''' Add a new document (row) to the matrix.
         doc_id: Identifier for the document, eg. file name, url, etc.
         doc_class: You might need this in classification.
         doc_terms: List of terms you got after tokenizing the document.
                    An item can be a plain term, or a (term, frequency)
                    tuple.
         frequency: Only affects plain terms. If true, each occurrence adds
                    one to the term's cell. Else, the cell is set to 1
                    (a la Bernoulli). Tuple frequencies are always added.
         do_padding: If true, self.do_padding() is called at the end.
                     Check do_padding() for more info.
         unique_ids: When true, if a stored document has the same id, the
                     new terms are added to its row instead of creating a
                     new record (doc_class of the new call is then unused).
         stopwords: If not empty, ignore those stop words in doc_terms.
                    It is only read here, never modified.
     '''

     def term_of(item):
         # Whitelist and stopwords hold terms, so a (term, frequency)
         # pair is matched on its term. Comparing the whole tuple made
         # every pair fail the whitelist and slip past the stopwords.
         return item[0] if isinstance(item, tuple) else item

     # Discard anything not in whitelist if it is not empty
     if self.whitelist:
         doc_terms = [t for t in doc_terms
                      if term_of(t) in self.whitelist]
     # Discard anything in stopwords if not empty
     if stopwords:
         doc_terms = [t for t in doc_terms
                      if term_of(t) not in stopwords]

     # Build this document's row, registering terms in self.terms.
     my_doc_terms = SuperList()
     for term in doc_terms:
         if isinstance(term, tuple):
             term_idx = self.terms.unique_append(term[0])
             my_doc_terms.increment_after_padding(term_idx, term[1])
             continue
         term_idx = self.terms.unique_append(term)
         if frequency:
             my_doc_terms.increment_after_padding(term_idx, 1)
         else:
             my_doc_terms.insert_after_padding(term_idx, 1)

     # Filtering (or an empty doc_terms) may leave nothing; fall back to
     # a row of zeros, one for each entry of self.vocabulary().
     if not my_doc_terms:
         zeros = [float(0)] * len(self.vocabulary())
         my_doc_terms = SuperList(zeros)

     # With unique_ids, add to every stored row carrying this id (no
     # early exit); a new record is appended only if none matched.
     merged = False
     if unique_ids:
         for doc in self.docs:
             if doc['id'] == doc_id:
                 doc['terms'].add(my_doc_terms)
                 merged = True
     if not merged:
         self.docs.append({'id': doc_id,
                           'class': doc_class,
                           'terms': my_doc_terms})
     if do_padding:
         self.do_padding()
예제 #10
0
파일: matrix.py 프로젝트: Zhiyu-Chen/irlib
    def add_doc(self, doc_id='',
                doc_class='',
                doc_terms=[],
                frequency=False,
                do_padding=False,
                unique_ids=False,
                meta_data={}):
        ''' Add a new document (row) to the matrix.
            doc_id: Identifier for the document, eg. file name, url, etc.
            doc_class: You might need this in classification.
            doc_terms: List of terms you got after tokenizing the document.
                       An item can be a plain term, or a (term, frequency)
                       tuple. Must not be empty.
            frequency: Only affects plain terms. If true, each occurrence
                       adds one to the term's cell. Else, the cell is set
                       to 1 (a la Bernoulli). Tuple frequencies are always
                       added.
            do_padding: If true, self.do_padding() is called at the end.
                        Check do_padding() for more info.
            unique_ids: If true, the document is passed to
                        self.docs.add_unique() rather than append(), so
                        that documents sharing an id end up in one record.
            meta_data: More fields to add to the document, for your own use.
                       Its keys are copied last, so 'id', 'class' or 'terms'
                       in it replace the values set by this method.
            Raises ValueError when doc_terms is empty.
        '''
        if not doc_terms:
            raise ValueError('doc_terms cannot be empty')

        # Discard anything not in whitelist if it is not empty.
        # A (term, frequency) pair is matched on its term: the tuple
        # itself would never be found in a list of terms.
        if self.whitelist:
            doc_terms = [t for t in doc_terms
                         if (t[0] if isinstance(t, tuple) else t)
                         in self.whitelist]
        # Discard anything in blacklist if it is not empty
        if self.blacklist:
            doc_terms = [t for t in doc_terms
                         if (t[0] if isinstance(t, tuple) else t)
                         not in self.blacklist]

        # Build this document's row, registering terms in self.terms.
        my_doc_terms = SuperList()
        for term in doc_terms:
            if isinstance(term, tuple):
                term_idx = self.terms.unique_append(term[0])
                my_doc_terms.increment_after_padding(term_idx, term[1])
            else:
                term_idx = self.terms.unique_append(term)
                if frequency:
                    my_doc_terms.increment_after_padding(term_idx, 1)
                else:
                    my_doc_terms.insert_after_padding(term_idx, 1)

        # Filtering may have left nothing; fall back to a row of zeros,
        # one for each entry of self.vocabulary().
        if not my_doc_terms:
            zeros = [float(0)] * len(self.vocabulary())
            my_doc_terms = SuperList(zeros)

        doc_data = {
            'id': doc_id,
            'class': doc_class,
            'terms': my_doc_terms
        }

        for key in meta_data:
            doc_data[key] = meta_data[key]

        if unique_ids:
            self.docs.add_unique(doc_data)
        else:
            self.docs.append(doc_data)

        if do_padding:
            self.do_padding()