def extract_keywords(rec):
    """Extract tokenized keywords from a record.

    Each entry of ``rec['kw']`` is a sequence whose first element is the
    keyword string; that string is tokenized with
    ``text2words.text_to_words``.

    Returns a list of token lists (one per keyword), or an empty list
    when the record carries no ``'kw'`` field.  The missing-key guard
    restores the handling that was previously commented out, so records
    without keywords no longer raise ``KeyError``.
    """
    if 'kw' not in rec:
        return []
    # List comprehension is equivalent to the former map() under Python 2
    # and also yields a list (not an iterator) under Python 3.
    return [text2words.text_to_words(kw[0]) for kw in rec['kw']]
def calc_word_feats(s, words):
    """Calculate the number of occurrences of each of ``words`` in ``s``.

    ``s`` is tokenized with ``text2words.text_to_words``; tokens not in
    ``words`` are ignored.  Returns a list of counts in the same order
    as ``words``.
    """
    # Pre-seed every target word with a zero count.
    counts = dict.fromkeys(words, 0)
    for token in text2words.text_to_words(s):
        if token in counts:
            counts[token] += 1
    return [counts[w] for w in words]
def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

    This implementation does not preserve case. Clients may wish to
    override this behavior with their own tokenization.
    """
    # Previous regex-based tokenizer, kept for reference:
    # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
    #
    # NOTE(review): "from __future__ import absolute_import" had been
    # fused into this body; a __future__ import inside a function is a
    # SyntaxError and must live at the top of the module, so it was
    # removed from the method.
    import os
    import sys
    lib_path = os.path.abspath(
        os.path.sep.join(['..', '..', '..', 'document_classification']))
    # Guard against appending the same directory on every call, which
    # previously grew sys.path without bound.
    if lib_path not in sys.path:
        sys.path.append(lib_path)
    from tools import text2words
    return text2words.text_to_words(text)
def extract_categs_msc(rec):
    """Collect tokenized MSC category codes from a record.

    Strips the ``bwmeta1.category-class.`` prefix from each category
    identifier and keeps only those in the ``MSC`` scheme.  Returns a
    list of token lists, or ``None`` when the record has no
    ``'categories'`` field.
    """
    if 'categories' not in rec:
        return None
    result = []
    for entry in rec['categories']:
        scheme_and_code = entry[0].replace("bwmeta1.category-class.", '')
        parts = scheme_and_code.split(':')
        if parts[0] != 'MSC':
            continue
        result.append(text2words.text_to_words(parts[1]))
    return result
def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

    This implementation does not preserve case. Clients may wish to
    override this behavior with their own tokenization.
    """
    # Previous regex-based tokenizer, kept for reference:
    # return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
    #
    # NOTE(review): "from __future__ import absolute_import" had been
    # fused into this body; a __future__ import inside a function is a
    # SyntaxError and must live at the top of the module, so it was
    # removed from the method.
    import os
    import sys
    lib_path = os.path.abspath(
        os.path.sep.join(['..', '..', '..', 'document_classification']))
    # Guard against appending the same directory on every call, which
    # previously grew sys.path without bound.
    if lib_path not in sys.path:
        sys.path.append(lib_path)
    from tools import text2words
    return text2words.text_to_words(text)
def words_freqs(records, features):
    """Count and print word frequencies over the given record features.

    Joins the text of each feature in ``features`` per record (records
    yielded by ``gen_record``), tokenizes it with ``text_to_words``,
    then prints the total token count and each word with its frequency,
    most frequent first.

    Fixes: the body used Python-2-only syntax (``print`` statements,
    ``dict.iteritems``) and ``defaultdict(lambda: 0)``; this version
    runs identically under Python 2 and 3.
    """
    all_w = 0
    words = defaultdict(int)
    # 'mc' is requested from gen_record alongside the features, matching
    # the original call; only the listed features are tokenized.
    for rec in gen_record(records, features + ['mc']):
        for w in text_to_words(" ".join(rec[f] for f in features)):
            words[w] += 1
            all_w += 1
    print("count of all words: %d" % all_w)
    print("words found:")
    for k, v in sorted(words.items(), key=lambda x: x[1], reverse=True):
        print("%s : %s" % (k, v))
def print_words_freqs(filename):
    """Read *filename* line by line and print word frequency counts.

    Tokenizes each line with ``text_to_words``, then prints the total
    token count and each word with its frequency, most frequent first.

    Fixes: the body used Python-2-only syntax (``print`` statements,
    ``dict.iteritems``) and ``defaultdict(lambda: 0)``; this version
    runs identically under Python 2 and 3.
    """
    words = defaultdict(int)
    all_w = 0
    with open(filename) as f:
        for line in f:
            for w in text_to_words(line):
                words[w] += 1
                all_w += 1
    print("count of all words: %d" % all_w)
    print("words found:")
    for k, v in sorted(words.items(), key=lambda x: x[1], reverse=True):
        print("%s : %s" % (k, v))