Python is_monocase示例

编程语言: Python

命名空间/包名称: util

方法/功能: is_monocase

hotexamples.com的示例: 4

Python is_monocase - 共找到4个示例。以下是从开源项目中提取的、评价最高的util.is_monocase真实Python代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

示例#1

显示文件

def main(docids, directory):
    """Print the id of every document whose first title is not monocase.

    For each id in ``docids`` the files ``<directory>/<id>.auxil`` and
    ``<directory>/<id>.paf`` are handed to ``separate_title_from_body``; the
    ``'token'`` values of the first title's ``'features'`` are then checked
    with ``is_monocase``. Ids for which that check is falsy are printed.

    A progress line ``kept/processed/total`` is logged every 1000 documents.
    """
    kept = 0
    for index, doc_id in enumerate(docids):
        # Periodic progress report; the first one fires at index 0.
        if not index % 1000:
            logger.info('{}/{}/{}'.format(kept, index, len(docids)))

        base = os.path.join(directory, doc_id)
        auxil_path = base + '.auxil'
        paf_path = base + '.paf'
        titles, _body = separate_title_from_body(auxil_path, paf_path)

        title_tokens = [feature['token'] for feature in titles[0]['features']]
        if is_monocase(title_tokens):
            continue  # monocase titles are filtered out

        print(doc_id)
        kept += 1

示例#2

显示文件

文件： filter_doc_ids.py 项目： xiaohan2012/capitalization-restoration-train

def main(docids, directory):
    """Filter document ids, printing those whose title is not monocase.

    Each id is joined onto ``directory`` to form a base path; the ``.auxil``
    and ``.paf`` files at that base are passed to
    ``separate_title_from_body``. The tokens of the first returned title are
    tested with ``is_monocase`` and the id is printed when the test is falsy.

    Every 1000 ids a ``printed/seen/total`` progress message is logged.
    """
    n_printed = 0
    for seen, doc_id in enumerate(docids):
        if seen % 1000 == 0:
            progress = '{}/{}/{}'.format(n_printed, seen, len(docids))
            logger.info(progress)

        stem = os.path.join(directory, doc_id)
        titles, _ignored = separate_title_from_body(stem + '.auxil',
                                                    stem + '.paf')
        # Only the first title is inspected.
        first_title_features = titles[0]['features']
        tokens = [entry['token'] for entry in first_title_features]

        mixed_case = not is_monocase(tokens)
        if mixed_case:
            print(doc_id)
            n_printed += 1

示例#3

显示文件

def main():
    """Print one JSON line ``[fname, title]`` per usable corpus document.

    Builds one directory path per (month, day) for the hard-coded year and
    iterates over the file names produced by ``get_file_names``. For each
    file the title is extracted and normalized; the file is reported when

    * ``is_monocase`` is falsy for the ``nltk``-tokenized title,
    * ``guessLanguage`` returns ``"en"`` for the title, and
    * the body from ``get_document_content_paf`` is not blank.

    Files whose title cannot be extracted, or is empty, are skipped.
    A ``collected / seen`` progress message is logged every 100 files.
    """

    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02'] # 2015-08-13
    # months = ['02', '03', '04', '05'], 2015-08-05
    # months = ['03']  # 2015-08-13

    days = xrange(1, 32)
    # Zero-pad the day. The previous '{:2d}' padded with a space and produced
    # paths such as '.../11/ 1/' for days 1-9.
    # NOTE(review): assumes day directories are named '01'..'31' - confirm.
    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:02d}/'.format(year, month, day)
        for month in months for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except Exception:
            # Best-effort: skip files without an extractable title, but let
            # KeyboardInterrupt / SystemExit propagate so the run can be
            # stopped.
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # is not monocase and is English
        if (not is_monocase(nltk.word_tokenize(title))
                and guessLanguage(title) == "en"):
            body = get_document_content_paf(fname)
            if body.strip():  # non-empty
                collected += 1
                # Single-argument call form behaves the same on Python 2.
                print(json.dumps([fname, unicode(title).encode("utf8")]))

示例#4

显示文件

文件： print_filenames_and_titles.py 项目： xiaohan2012/capitalization-restoration-train

def main():
    """Print ``[fname, title]`` as a JSON line for each usable document.

    One directory path is generated per (month, day) of the hard-coded
    year; ``get_file_names`` supplies the files under those paths. A file
    is printed only if its extracted, normalized title is not monocase
    (per ``is_monocase`` on the ``nltk`` tokens), ``guessLanguage`` reports
    ``"en"`` for it, and ``get_document_content_paf`` returns a non-blank
    body. Files with no extractable or empty title are skipped.

    Every 100 files a ``collected / seen`` progress message is logged.
    """

    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02'] # 2015-08-13
    # months = ['02', '03', '04', '05'], 2015-08-05
    # months = ['03']  # 2015-08-13

    days = xrange(1, 32)
    # '{:02d}' zero-pads the day; the earlier '{:2d}' space-padded it, giving
    # paths like '.../11/ 1/' for single-digit days.
    # NOTE(review): assumes day directories are zero-padded - confirm.
    paths = ['/cs/puls/Corpus/Business/Puls/{}/{}/{:02d}/'.format(year, month, day)
             for month in months
             for day in days]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except Exception:
            # Skip unreadable titles without swallowing KeyboardInterrupt or
            # SystemExit, which a bare ``except:`` would also catch.
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # is not monocase and is English
        is_mixed_case = not is_monocase(nltk.word_tokenize(title))
        if is_mixed_case and guessLanguage(title) == "en":
            body = get_document_content_paf(fname)
            if body.strip():  # non-empty
                collected += 1
                # Parenthesized single-argument print works on Python 2 too.
                print(json.dumps([fname, unicode(title).encode("utf8")]))