Exemplo n.º 1
0
def test_split_camel_case():
    """Check split_camel_case against a table of (input, expected) pairs."""
    # Note the last case: the expected output contains a double space, i.e.
    # the space already present in the input is kept next to the added one.
    cases = (
        ('BlaBla', 'Bla Bla'),
        ('Bla', 'Bla'),
        ('iBla', 'iBla'),
        ('iBlaBla', 'iBla Bla'),
        ('BlaBlaBlaaa', 'Bla Bla Blaaa'),
        ('iBlaBla BlaaaBla', 'iBla Bla  Blaaa Bla'),
    )
    for raw, expected in cases:
        eq_(split_camel_case(raw), expected)
Exemplo n.º 2
0
def extract_text(doc, title_weight=None, header_weights=None, use_pdf=True,
                 use_stemmer=False, ukkonen_len=0):
    """Extract cleaned, lower-cased text from an HTML or PDF document.

    Args:
        doc: Document content. It is routed to ``extract_text_pdf`` when
            ``is_pdf(doc)`` is true, otherwise to ``extract_text_html``.
        title_weight: Forwarded unchanged to ``extract_text_html``.
        header_weights: Forwarded unchanged to ``extract_text_html``.
        use_pdf: When false, PDF documents are skipped and '' is returned.
        use_stemmer: When true, ``apply_stemmer`` is run on the cleaned text.
        ukkonen_len: When truthy, forwarded to
            ``remove_repeated_long_strings`` to strip long repeated strings.

    Returns:
        The cleaned text in lower case, or '' for a PDF that is skipped or
        whose extraction raises.
    """
    if is_pdf(doc):
        if not use_pdf:
            return ''
        try:
            text = extract_text_pdf(doc)
        except Exception:
            # Best effort: a broken PDF yields no text. Only ``Exception`` is
            # caught so KeyboardInterrupt/SystemExit still propagate.
            # TODO: Nice error handling
            return ''
    else:
        text = extract_text_html(
            str2unicode(doc),
            title_weight=title_weight,
            header_weights=header_weights)
    # Replace newlines etc.
    text = re.sub(r'\s+', ' ', text)
    # Replace Umlaute and apply 'unidecode'
    text = clean_text(text)
    # Remove punctuation: every punctuation character becomes a space.
    # ``string.maketrans`` only exists on Python 2; fall back to
    # ``str.maketrans`` so the same code also runs on Python 3.
    make_table = getattr(string, 'maketrans', None) or str.maketrans
    replace_punctuation = make_table(
        string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Strip
    text = text.strip()
    # Split camel case words
    text = ' '.join([split_camel_case(word) for word in text.split(' ')])
    # Remove digits
    text = ''.join([c for c in text if not c.isdigit()])
    # Remove single characters (also drops the empty strings left behind by
    # consecutive spaces)
    text = ' '.join([word for word in text.split(' ') if len(word) > 1])
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Stem
    if use_stemmer:
        text = apply_stemmer(text)
    # Remove long repeated strings
    if ukkonen_len:
        text = remove_repeated_long_strings(text, ukkonen_len)
    return text.lower()