from tools import text2words

def extract_keywords(rec):
    """Tokenize every keyword of a record; each entry of rec['kw'] holds
    the keyword string in position 0."""
    if 'kw' not in rec:
        return []
    # For a single flat word list instead of one list per keyword:
    # [w for kw in rec['kw'] for w in text2words.text_to_words(kw[0])]
    return [text2words.text_to_words(kw[0]) for kw in rec['kw']]
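A minimal usage sketch; the record shape and the behavior of text2words.text_to_words (lower-cased word tokens) are assumptions inferred from the surrounding examples, not confirmed by this snippet:

rec = {'kw': [('machine learning',), ('neural networks',)]}
# Assuming text_to_words lower-cases and splits on non-word characters,
# this prints [['machine', 'learning'], ['neural', 'networks']].
print(extract_keywords(rec))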
Example #3
from tools import text2words

def calc_word_feats(s, words):
    """Count the occurrences in s of each word in words, returning the
    counts in the same order as words (a fixed-length feature vector)."""
    occurrences = dict.fromkeys(words, 0)

    for w in text2words.text_to_words(s):
        if w in occurrences:
            occurrences[w] += 1

    return [occurrences[w] for w in words]
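A quick check of the feature vector it produces; the vocabulary and sentence are made up, and the tokenizer is assumed to lower-case and split on non-word characters:

vocab = ['data', 'model', 'graph']
# 'data' appears twice, 'model' once, 'graph' never, so this prints [2, 1, 0].
print(calc_word_feats("The data model stores data.", vocab))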
Example #5
    def get_tokens(self, text):
        """Break a string into tokens, preserving URL tags as an entire token.

        This implementation does not preserve case.
        Clients may wish to override this behavior with their own tokenization.
        """
        # Earlier implementation, kept for reference:
        # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())
        import os
        import sys
        # Make the shared document_classification tools importable.
        lib_path = os.path.abspath(
            os.path.join('..', '..', '..', 'document_classification'))
        sys.path.append(lib_path)
        from tools import text2words
        return text2words.text_to_words(text)
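The method mixes a sys.path hack with the actual tokenization; a sketch of the same idea at module level, with the relative layout (three directories up to a sibling document_classification package) assumed from the snippet:

import os
import sys

lib_path = os.path.abspath(os.path.join('..', '..', '..',
                                        'document_classification'))
if lib_path not in sys.path:  # avoid re-appending on every call
    sys.path.append(lib_path)

from tools import text2words

print(text2words.text_to_words("Some Example Text"))

Guarding the append keeps sys.path from growing each time the tokenizer runs, which the original per-call append does not.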
Example #6
from tools import text2words

def extract_categs_msc(rec):
    """Tokenize the MSC (Mathematics Subject Classification) codes of a
    record; returns None when the record has no categories at all."""
    if 'categories' not in rec:
        return None

    new_categs = []
    for categ in rec['categories']:
        # Identifiers look like 'bwmeta1.category-class.<scheme>:<code>';
        # strip the prefix, then split into scheme and code.
        categ_parsed = categ[0].replace("bwmeta1.category-class.", '')
        categ_parsed = categ_parsed.split(':')
        if categ_parsed[0] == 'MSC':
            new_categs.append(text2words.text_to_words(categ_parsed[1]))

    return new_categs
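A usage sketch with a made-up record; the bwmeta1.category-class.<scheme>:<code> identifier format is inferred from the string handling above:

rec = {'categories': [('bwmeta1.category-class.MSC:68T50',),
                      ('bwmeta1.category-class.PACS:07.05.Mh',)]}
# Only the MSC entry survives the scheme filter; with a tokenizer that
# splits on non-word characters this prints something like [['68t50']].
print(extract_categs_msc(rec))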
Example #9
from collections import defaultdict

def words_freqs(records, features):
    """Count every word across the given feature fields (e.g. abstract and
    title) of all records and print the frequencies, most common first.

    gen_record and text_to_words are helpers defined elsewhere in the
    project.
    """
    all_w = 0
    words = defaultdict(int)
    for rec in gen_record(records, features + ['mc']):
        words_l = text_to_words(" ".join(rec[f] for f in features))
        for c in words_l:
            words[c] += 1
            all_w += 1
    print("count of all words:", all_w)
    print("words found:")
    w_sorted = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for k, v in w_sorted:
        print(k, ":", v)
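A runnable sketch with stub stand-ins for the project's helpers (gen_record and text_to_words here are simplified fakes, not the real implementations):

def gen_record(records, fields):
    # The real helper presumably selects/normalizes fields; this one
    # just yields the records unchanged.
    for rec in records:
        yield rec

def text_to_words(s):
    return s.lower().split()

records = [{'title': 'Deep learning', 'ab': 'learning to learn', 'mc': ''}]
# Prints "count of all words: 5" followed by each word and its frequency,
# with 'learning' (count 2) first. The 'ab' field name is made up.
words_freqs(records, ['title', 'ab'])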
Example #10
from collections import defaultdict

def print_words_freqs(filename):
    """Count every word in a text file and print the frequencies, most
    common first; text_to_words is a helper defined elsewhere."""
    words = defaultdict(int)
    all_w = 0
    with open(filename) as f:
        for line in f:
            words_l = text_to_words(line)
            for c in words_l:
                words[c] += 1
                all_w += 1
    print("count of all words:", all_w)
    print("words found:")
    w_sorted = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for k, v in w_sorted:
        print(k, ":", v)
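Invocation is straightforward; the file name and contents are made up:

with open('sample.txt', 'w') as f:
    f.write("the cat sat on the mat\n")

# With a whitespace tokenizer the top entry printed is: the : 2
print_words_freqs('sample.txt')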