def extract_keywords(rec):
    """Extract tokenized keywords from a record.

    Each entry of ``rec['kw']`` is a sequence whose first element is the
    keyword string; that string is tokenized with
    ``text2words.text_to_words``.

    Returns a list of token lists (one per keyword), or an empty list
    when the record carries no ``'kw'`` field.  The missing-key guard
    restores the handling that was previously commented out, so records
    without keywords no longer raise ``KeyError``.
    """
    if 'kw' not in rec:
        return []
    # List comprehension is equivalent to the former map() under Python 2
    # and also yields a list (not an iterator) under Python 3.
    return [text2words.text_to_words(kw[0]) for kw in rec['kw']]
def calc_word_feats(s, words):
    """Calculate the number of occurrences of each of ``words`` in ``s``.

    ``s`` is tokenized with ``text2words.text_to_words``; tokens not in
    ``words`` are ignored.  Returns a list of counts in the same order
    as ``words``.
    """
    # Pre-seed every target word with a zero count.
    counts = dict.fromkeys(words, 0)
    for token in text2words.text_to_words(s):
        if token in counts:
            counts[token] += 1
    return [counts[w] for w in words]
def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

    This implementation does not preserve case. Clients may wish to
    override this behavior with their own tokenization.
    """
    # Previous regex-based tokenizer, kept for reference:
    # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
    #
    # NOTE(review): "from __future__ import absolute_import" had been
    # fused into this body; a __future__ import inside a function is a
    # SyntaxError and must live at the top of the module, so it was
    # removed from the method.
    import os
    import sys
    lib_path = os.path.abspath(
        os.path.sep.join(['..', '..', '..', 'document_classification']))
    # Guard against appending the same directory on every call, which
    # previously grew sys.path without bound.
    if lib_path not in sys.path:
        sys.path.append(lib_path)
    from tools import text2words
    return text2words.text_to_words(text)
def extract_categs_msc(rec):
    """Collect tokenized MSC category codes from a record.

    Strips the ``bwmeta1.category-class.`` prefix from each category
    identifier and keeps only those in the ``MSC`` scheme.  Returns a
    list of token lists, or ``None`` when the record has no
    ``'categories'`` field.
    """
    if 'categories' not in rec:
        return None
    result = []
    for entry in rec['categories']:
        scheme_and_code = entry[0].replace("bwmeta1.category-class.", '')
        parts = scheme_and_code.split(':')
        if parts[0] != 'MSC':
            continue
        result.append(text2words.text_to_words(parts[1]))
    return result
def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

    This implementation does not preserve case. Clients may wish to
    override this behavior with their own tokenization.
    """
    # Previous regex-based tokenizer, kept for reference:
    # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
    #
    # NOTE(review): "from __future__ import absolute_import" had been
    # fused into this body; a __future__ import inside a function is a
    # SyntaxError and must live at the top of the module, so it was
    # removed from the method.
    import os
    import sys
    lib_path = os.path.abspath(
        os.path.sep.join(['..', '..', '..', 'document_classification']))
    # Guard against appending the same directory on every call, which
    # previously grew sys.path without bound.
    if lib_path not in sys.path:
        sys.path.append(lib_path)
    from tools import text2words
    return text2words.text_to_words(text)
def words_freqs(records, features):
    """Count and print word frequencies over the given record features.

    Joins the text of each feature in ``features`` per record (records
    yielded by ``gen_record``), tokenizes it with ``text_to_words``,
    then prints the total token count and each word with its frequency,
    most frequent first.

    Fixes: the body used Python-2-only syntax (``print`` statements,
    ``dict.iteritems``) and ``defaultdict(lambda: 0)``; this version
    runs identically under Python 2 and 3.
    """
    all_w = 0
    words = defaultdict(int)
    # 'mc' is requested from gen_record alongside the features, matching
    # the original call; only the listed features are tokenized.
    for rec in gen_record(records, features + ['mc']):
        for w in text_to_words(" ".join(rec[f] for f in features)):
            words[w] += 1
            all_w += 1
    print("count of all words: %d" % all_w)
    print("words found:")
    for k, v in sorted(words.items(), key=lambda x: x[1], reverse=True):
        print("%s : %s" % (k, v))
def print_words_freqs(filename):
    """Read *filename* line by line and print word frequency counts.

    Tokenizes each line with ``text_to_words``, then prints the total
    token count and each word with its frequency, most frequent first.

    Fixes: the body used Python-2-only syntax (``print`` statements,
    ``dict.iteritems``) and ``defaultdict(lambda: 0)``; this version
    runs identically under Python 2 and 3.
    """
    words = defaultdict(int)
    all_w = 0
    with open(filename) as f:
        for line in f:
            for w in text_to_words(line):
                words[w] += 1
                all_w += 1
    print("count of all words: %d" % all_w)
    print("words found:")
    for k, v in sorted(words.items(), key=lambda x: x[1], reverse=True):
        print("%s : %s" % (k, v))