示例#1
0
def main(docids, directory):
    good_cnt = 0
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            logger.info('{}/{}/{}'.format(good_cnt, i, len(docids)))

        path = os.path.join(directory, id_)
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        tokens = [t['token'] for t in titles[0]['features']]
        if not is_monocase(tokens):
            print(id_)
            good_cnt += 1
def main(docids, directory):
    good_cnt = 0
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            logger.info('{}/{}/{}'.format(good_cnt, i, len(docids)))

        path = os.path.join(directory, id_)
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        tokens = [t['token']
                  for t in titles[0]['features']]
        if not is_monocase(tokens):
            print(id_)
            good_cnt += 1
示例#3
0
def main():
    """print title each per one line from the corpus"""

    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02'] # 2015-08-13
    # months = ['02', '03', '04', '05'], 2015-08-05
    # months = ['03']  # 2015-08-13

    days = xrange(1, 32)
    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:2d}/'.format(year, month, day)
        for month in months for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except:
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # is not monocase and is English
        if not is_monocase(nltk.word_tokenize(title)) and\
           guessLanguage(title) == "en":
            body = get_document_content_paf(fname)
            if len(body.strip()) > 0:  # non-empty
                collected += 1
                print json.dumps([fname, unicode(title).encode("utf8")])
def main():
    """print title each per one line from the corpus"""
    
    year = 2014
    # months = ['01', '02', '03', '04', '05', '06', '07']  # 2015-08-05
    months = range(11, 13)
    # months = ['02'] # 2015-08-13
    # months = ['02', '03', '04', '05'], 2015-08-05
    # months = ['03']  # 2015-08-13
    
    days = xrange(1, 32)
    paths = ['/cs/puls/Corpus/Business/Puls/{}/{}/{:2d}/'.format(year, month, day)
             for month in months
             for day in days]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except:
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue
            
        title = normalize_title(title)
        
        # is not monocase and is English
        if not is_monocase(nltk.word_tokenize(title)) and\
           guessLanguage(title) == "en":
            body = get_document_content_paf(fname)
            if len(body.strip()) > 0:  # non-empty
                collected += 1
                print json.dumps([fname, unicode(title).encode("utf8")])