def prune(self, prune_map, show_progress=True):
    '''
    Helper method to remove terms (fields) from our matrix.
    prune_map is a list of 0's and 1's of the same length as self.terms.
    For each term, if 0, then remove it, otherwise keep it.
    '''
    if not prune_map or len(prune_map) != len(self.terms):
        return False
    if show_progress:
        print ' Pruning terms list ...'
    new_terms = SuperList()
    # Walk indices in reverse; the vocabulary and the document rows are
    # rebuilt in the same (reversed) order, so they stay aligned.
    for i in range(len(prune_map) - 1, -1, -1):
        if prune_map[i] == 1:
            #print self.terms[i]
            new_terms.append(self.terms[i])
    self.terms = new_terms
    if show_progress:
        print ' Pruning documents ...'
    p = Progress(n=len(self), percent=10)
    for doc in self.docs:
        new_doc_terms = SuperList()
        for i in range(len(prune_map) - 1, -1, -1):
            if prune_map[i] == 1:
                new_doc_terms.append(doc['terms'][i])
        doc['terms'] = new_doc_terms
        if show_progress:
            p.show(message=' Pruning progress:')
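What a prune_map does is easiest to see on plain lists: entries flagged 0 are dropped from the vocabulary and from every document row, entries flagged 1 are kept. A minimal, self-contained sketch (illustrative only; plain lists stand in for SuperList and the Matrix internals):

terms = ['a', 'b', 'c', 'd']
doc_counts = [3, 0, 1, 2]   # one document's row, aligned with terms
prune_map = [1, 0, 1, 1]    # keep 'a', drop 'b', keep 'c' and 'd'

kept_terms = [t for t, keep in zip(terms, prune_map) if keep]
kept_counts = [c for c, keep in zip(doc_counts, prune_map) if keep]
assert kept_terms == ['a', 'c', 'd']
assert kept_counts == [3, 1, 2]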
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False,
            unique_ids=False, meta_data={}):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples; (string, frequency) pairs.
    frequency: If True, each occurrence of a term increments its count.
               Otherwise counts are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    unique_ids: When True, if two documents are added with the same id,
                their terms are summed up into only one record.
    meta_data: More fields to add to the document, for your own use.
    '''
    if not doc_terms:
        raise ValueError('doc_terms cannot be empty')
    # Update the list of terms if a new term is seen,
    # and add the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if the whitelist is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in the blacklist (stop words), if it is not empty
    if self.blacklist:
        doc_terms = [t for t in doc_terms if t not in self.blacklist]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    doc_data = {'id': doc_id, 'class': doc_class, 'terms': my_doc_terms}
    for key in meta_data:
        doc_data[key] = meta_data[key]
    if unique_ids:
        self.docs.add_unique(doc_data)
    else:
        self.docs.append(doc_data)
    if do_padding:
        self.do_padding()
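The bookkeeping behind unique_append and the *_after_padding helpers can be shown with plain lists: each new term gets the next index in the shared vocabulary, and a document row is padded with zeros up to that index before its count is written. A self-contained sketch, assuming the helpers behave as their names suggest (doc_to_row and vocabulary are illustrative names, not library code):

vocabulary = []                      # shared across all documents

def doc_to_row(doc_terms):
    row = []
    for term in doc_terms:
        if term not in vocabulary:   # unique_append: add new term to vocabulary
            vocabulary.append(term)
        idx = vocabulary.index(term)
        if len(row) <= idx:          # pad the row with zeros up to idx
            row += [0] * (idx + 1 - len(row))
        row[idx] += 1                # frequency=True behaviour
    return row

row1 = doc_to_row(['buy', 'now', 'buy'])  # vocabulary becomes ['buy', 'now']
row2 = doc_to_row(['now', 'meeting'])     # vocabulary becomes ['buy', 'now', 'meeting']
assert row1 == [2, 1]
assert row2 == [0, 1, 1]   # shorter rows are right-padded later by do_padding()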
def load(self, filename, delimiter='\t', header=True):
    '''
    Loads the matrix from a CSV/TSV file.
    '''
    with open(filename, 'r') as fd:
        if header:
            header_line = fd.readline()
            header_data = header_line.strip().split(delimiter)
            # First 2 columns are id and class
            self.terms = SuperList(header_data[2:])
        for line in fd:
            # Each row: id, class, then one value per term
            fields = line.strip().split(delimiter)
            doc_data = {
                'id': fields[0],
                'class': fields[1],
                'terms': SuperList(fields[2:])
            }
            self.docs.append(doc_data)
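With the default tab delimiter, the file this method expects has one header row naming the terms after the id and class columns, then one row per document. An illustrative toy layout (columns separated by the delimiter):

id      class   buy     now     meeting
d1      spam    2       1       0
d2      ham     0       1       1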
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
    frequency: If True, each occurrence of a term increments its count.
               Otherwise counts are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # Update the list of terms if a new term is seen,
    # and add the document (row) with its associated data.
    my_doc_terms = SuperList()
    for term in doc_terms:
        term_idx = self.terms.unique_append(term)
        #my_doc_terms.insert_after_padding(self.terms.index(term))
        if frequency:
            my_doc_terms.increment_after_padding(term_idx, 1)
        else:
            my_doc_terms.insert_after_padding(term_idx, 1)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update the per-class terms summary if a new class is seen.
    #self.classes.unique_append(doc_class)
    if doc_class in self.classes:
        self.classes[doc_class].add(my_doc_terms)
    else:
        self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples; (string, frequency) pairs.
    frequency: If True, each occurrence of a term increments its count.
               Otherwise counts are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Check do_padding() for more info.
    '''
    # Update the list of terms if a new term is seen,
    # and add the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if the whitelist is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    # In the rare event that whitelisting leaves doc_terms empty,
    # add at least one zero to my_doc_terms.
    if not my_doc_terms:
        zeros = [float(0)] * len(self.vocabulary())
        my_doc_terms = SuperList(zeros)
    self.docs.append({'id': doc_id,
                      'class': doc_class,
                      'terms': my_doc_terms})
    # Update the per-class terms summary if a new class is seen.
    #self.classes.unique_append(doc_class)
    #if self.classes.has_key(doc_class):
    #    self.classes[doc_class].add(my_doc_terms)
    #else:
    #    self.classes[doc_class] = my_doc_terms
    if do_padding:
        self.do_padding()
def tf_idf(self, do_idf=True):
    '''
    Converts matrix to tf.idf values
    do_idf: if False, convert to tf only
    '''
    # Requires the math module to be imported at module level.
    N = len(self.docs)
    # Document frequency: in how many documents does each term appear?
    df = SuperList([0] * len(self.terms))
    for doc in self.docs:
        row = SuperList([0] * len(self.terms))
        for idx in range(len(self.terms)):
            if doc['terms'][idx] > 0:
                row[idx] = 1
        df.add(row)
    # Replace each raw count with its (log) tf, optionally scaled by idf.
    for doc in self.docs:
        for idx in range(len(self.terms)):
            tf = self._log_tf(doc['terms'][idx])
            idf = math.log10(float(N) / df[idx])
            if do_idf:
                doc['terms'][idx] = tf * idf
            else:
                doc['terms'][idx] = tf
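For a single cell the loop above reduces to tf times log10(N / df). A self-contained worked example; the exact _log_tf weighting is not shown in this snippet, so the common 1 + log10(count) form is assumed here:

import math

N = 4      # total number of documents
df = 2     # number of documents containing the term
count = 3  # occurrences of the term in this document

tf = 1 + math.log10(count) if count > 0 else 0.0   # assumed _log_tf weighting
idf = math.log10(float(N) / df)
assert abs(tf * idf - 0.4447) < 1e-3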
def __init__(self, matrix):
    self.mx = matrix
    self.N = 0
    self.classes = {}
    self.terms = SuperList()
    for c in self.mx.classes:
        self.classes[c] = {}
        self.classes[c]['terms'] = self.mx.classes[c]
        self.classes[c]['total'] = sum(self.classes[c]['terms'])
        self.terms.add(self.classes[c]['terms'])
        self.N += self.classes[c]['total']
    self.mi_terms = []
def add_doc(self, doc_id='', doc_class='', doc_terms=[],
            frequency=False, do_padding=False, stopwords=[]):
    '''
    Add a new document to our matrix:
    doc_id: Identifier for the document, e.g. file name, url, etc.
    doc_class: You might need this in classification.
    doc_terms: List of terms you got after tokenizing the document.
               Terms can be tuples; (string, frequency) pairs.
    frequency: If True, each occurrence of a term increments its count.
               Otherwise counts are only 0 or 1 (a la Bernoulli).
    do_padding: Boolean. Not used here.
    stopwords: If not empty, ignore these stop words in doc_terms.
    '''
    # Update the list of terms if a new term is seen,
    # and add the document (row) with its associated data.
    my_doc_terms = SuperList()
    # Discard anything not in the whitelist, if the whitelist is not empty
    if self.whitelist:
        doc_terms = [t for t in doc_terms if t in self.whitelist]
    # Discard anything in stopwords, if it is not empty
    if stopwords:
        doc_terms = [t for t in doc_terms if t not in stopwords]
    for term in doc_terms:
        if type(term) == tuple:
            term_idx = self.terms.unique_append(term[0])
            my_doc_terms.increment_after_padding(term_idx, term[1])
        else:
            term_idx = self.terms.unique_append(term)
            if frequency:
                my_doc_terms.increment_after_padding(term_idx, 1)
            else:
                my_doc_terms.insert_after_padding(term_idx, 1)
    #self.docs.append({ 'id': doc_id,
    #                   'class': doc_class,
    #                   'terms': my_doc_terms})
    # Documents sharing a class are merged into a single row per class.
    found = 0
    for doc in self.docs:
        if doc['class'] == doc_class:
            doc['terms'].add(my_doc_terms)
            found = 1
    if not found:
        self.docs.append({'id': doc_id,
                          'class': doc_class,
                          'terms': my_doc_terms})
    if do_padding:
        self.do_padding()
def __init__(self, whitelist=[]):
    '''
    Initialize our matrix.
    whitelist: If not empty, discard any terms not in the whitelist
               when adding new terms via add_doc().
    terms: We will populate this with our vocabulary of terms.
    docs: This is our actual 2D matrix of terms/docs.
          A list of the following dictionary:
          {
            'id': Unique ID of each document,
            'class': In case of labeled data, the doc class label,
            'terms': list of 1's and 0's, i.e. term frequencies.
          }
    '''
    # List of unique terms (vocabulary)
    self.terms = SuperList()
    # List of document classes and terms summary
    #self.classes = {}
    self.docs = []
    self.whitelist = whitelist
def query_to_vector(self, q_terms, frequency=False):
    '''
    Converts a query to a list aligned with our self.terms.
    Terms not seen before will be ignored.
    q_terms: list of query terms
    frequency: return a multinomial or multivariate list?
    '''
    my_query_vector = SuperList()
    my_query_vector.expand(new_len=len(self.terms))
    for term in q_terms:
        try:
            term_idx = self.terms.index(term)
        except ValueError:
            # Term not seen before, skip it
            continue
        #print term, self.terms.index(term)
        if frequency:
            my_query_vector.increment_after_padding(term_idx, 1)
        else:
            my_query_vector.insert_after_padding(term_idx, 1)
    return my_query_vector
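Aligning a query with a fixed vocabulary, ignoring unseen terms, looks like this on plain lists. A self-contained sketch of the multivariate (0/1) case; the names are illustrative, not library code:

vocabulary = ['buy', 'now', 'meeting', 'agenda']
q_terms = ['meeting', 'tomorrow', 'meeting']

vector = [0] * len(vocabulary)
for term in q_terms:
    if term in vocabulary:           # unseen terms ('tomorrow') are skipped
        vector[vocabulary.index(term)] = 1
assert vector == [0, 0, 1, 0]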
def to_be_deleted__getitem__(self, term):
    'Returns occurrences of term in all documents'
    if term not in self:
        return SuperList()
    col = [doc['terms'][self.terms.index(term)] for doc in self.docs]
    return SuperList(col)
def __init__(self):
    # List of unique terms (vocabulary)
    self.terms = SuperList()
    # List of document classes and terms summary
    self.classes = {}
    self.docs = []