from tools import text2words

def extract_keywords(rec):
    """Tokenize every keyword of a record; each entry of rec['kw'] holds
    the keyword string in position 0."""
    if 'kw' not in rec:
        return []
    # For a single flat word list instead of one list per keyword:
    # [w for kw in rec['kw'] for w in text2words.text_to_words(kw[0])]
    return [text2words.text_to_words(kw[0]) for kw in rec['kw']]
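A minimal usage sketch; the record shape and the behavior of text2words.text_to_words (lower-cased word tokens) are assumptions inferred from the surrounding examples, not confirmed by this snippet:

rec = {'kw': [('machine learning',), ('neural networks',)]}
# Assuming text_to_words lower-cases and splits on non-word characters,
# this prints [['machine', 'learning'], ['neural', 'networks']].
print(extract_keywords(rec))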
Example #3
from tools import text2words

def calc_word_feats(s, words):
    """Count the occurrences in s of each word in words, returning the
    counts in the same order as words (a fixed-length feature vector)."""
    occurrences = dict.fromkeys(words, 0)

    for w in text2words.text_to_words(s):
        if w in occurrences:
            occurrences[w] += 1

    return [occurrences[w] for w in words]
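A quick check of the feature vector it produces; the vocabulary and sentence are made up, and the tokenizer is assumed to lower-case and split on non-word characters:

vocab = ['data', 'model', 'graph']
# 'data' appears twice, 'model' once, 'graph' never, so this prints [2, 1, 0].
print(calc_word_feats("The data model stores data.", vocab))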
Example #5
    def get_tokens(self, text):
        """Break a string into tokens, preserving URL tags as an entire token.

        This implementation does not preserve case.
        Clients may wish to override this behavior with their own tokenization.
        """
        # Earlier implementation, kept for reference:
        # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())
        import os
        import sys
        # Make the shared document_classification tools importable.
        lib_path = os.path.abspath(
            os.path.join('..', '..', '..', 'document_classification'))
        sys.path.append(lib_path)
        from tools import text2words
        return text2words.text_to_words(text)
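The method mixes a sys.path hack with the actual tokenization; a sketch of the same idea at module level, with the relative layout (three directories up to a sibling document_classification package) assumed from the snippet:

import os
import sys

lib_path = os.path.abspath(os.path.join('..', '..', '..',
                                        'document_classification'))
if lib_path not in sys.path:  # avoid re-appending on every call
    sys.path.append(lib_path)

from tools import text2words

print(text2words.text_to_words("Some Example Text"))

Guarding the append keeps sys.path from growing each time the tokenizer runs, which the original per-call append does not.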
Example #6
from tools import text2words

def extract_categs_msc(rec):
    """Tokenize the MSC (Mathematics Subject Classification) codes of a
    record; returns None when the record has no categories at all."""
    if 'categories' not in rec:
        return None

    new_categs = []
    for categ in rec['categories']:
        # Identifiers look like 'bwmeta1.category-class.<scheme>:<code>';
        # strip the prefix, then split into scheme and code.
        categ_parsed = categ[0].replace("bwmeta1.category-class.", '')
        categ_parsed = categ_parsed.split(':')
        if categ_parsed[0] == 'MSC':
            new_categs.append(text2words.text_to_words(categ_parsed[1]))

    return new_categs
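A usage sketch with a made-up record; the bwmeta1.category-class.<scheme>:<code> identifier format is inferred from the string handling above:

rec = {'categories': [('bwmeta1.category-class.MSC:68T50',),
                      ('bwmeta1.category-class.PACS:07.05.Mh',)]}
# Only the MSC entry survives the scheme filter; with a tokenizer that
# splits on non-word characters this prints something like [['68t50']].
print(extract_categs_msc(rec))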
Example #9
from collections import defaultdict

def words_freqs(records, features):
    """Count every word across the given feature fields (e.g. abstract and
    title) of all records and print the frequencies, most common first.

    gen_record and text_to_words are helpers defined elsewhere in the
    project.
    """
    all_w = 0
    words = defaultdict(int)
    for rec in gen_record(records, features + ['mc']):
        words_l = text_to_words(" ".join(rec[f] for f in features))
        for c in words_l:
            words[c] += 1
            all_w += 1
    print("count of all words:", all_w)
    print("words found:")
    w_sorted = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for k, v in w_sorted:
        print(k, ":", v)
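A runnable sketch with stub stand-ins for the project's helpers (gen_record and text_to_words here are simplified fakes, not the real implementations):

def gen_record(records, fields):
    # The real helper presumably selects/normalizes fields; this one
    # just yields the records unchanged.
    for rec in records:
        yield rec

def text_to_words(s):
    return s.lower().split()

records = [{'title': 'Deep learning', 'ab': 'learning to learn', 'mc': ''}]
# Prints "count of all words: 5" followed by each word and its frequency,
# with 'learning' (count 2) first. The 'ab' field name is made up.
words_freqs(records, ['title', 'ab'])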
Example #10
from collections import defaultdict

def print_words_freqs(filename):
    """Count every word in a text file and print the frequencies, most
    common first; text_to_words is a helper defined elsewhere."""
    words = defaultdict(int)
    all_w = 0
    with open(filename) as f:
        for line in f:
            words_l = text_to_words(line)
            for c in words_l:
                words[c] += 1
                all_w += 1
    print("count of all words:", all_w)
    print("words found:")
    w_sorted = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for k, v in w_sorted:
        print(k, ":", v)
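Invocation is straightforward; the file name and contents are made up:

with open('sample.txt', 'w') as f:
    f.write("the cat sat on the mat\n")

# With a whitespace tokenizer the top entry printed is: the : 2
print_words_freqs('sample.txt')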