Example #1
 def get_text(soup: bs4.BeautifulSoup) -> str:
     """Get the raw text."""
     text = soup.getText()
     # translate newlines back from BigQuery
     text = re.sub(r'\n\n+', '\n', text)
     # translate double quotes back from BigQuery
     text = re.sub(r'xxxdblqte', ' " ', text)
     return normalize_whitespace(text)
Example #2
 def _clean_content(self, content):
     # strip out link markup, e.g. [foo](http://foo.com)
     content = REDDIT_LINK_RE.sub(r'\1', content)
     # clean up basic HTML cruft
     content = content.replace('&gt;', '>').replace('&lt;', '<')
     # strip out text markup, e.g. * for bold text
     content = content.replace('`', '').replace('*', '').replace('~', '')
     # normalize whitespace
     return preprocess.normalize_whitespace(content)
Example #3
 def _clean_content(self, content):
     # strip out link markup, e.g. [foo](http://foo.com)
     content = REDDIT_LINK_RE.sub(r'\1', content)
     # clean up basic HTML cruft
     content = content.replace('&gt;', '>').replace('&lt;', '<')
     # strip out text markup, e.g. * for bold text
     content = content.replace('`', '').replace('*', '').replace('~', '')
     # normalize whitespace
     return normalize_whitespace(content)
Example #4
def clean_sentence(sentences):
    c = sentences.replace('-', ' ')  # people use hyphens to concatenate words
    c = normalize_whitespace(c)
    c = preprocess_text(c,
                        lowercase=True,
                        no_numbers=True,
                        no_punct=True,
                        no_contractions=True)
    return c
Example #5
 def _clean_content(self, content):
     # strip out link markup, e.g. [foo](http://foo.com)
     content = REDDIT_LINK_RE.sub(r"\1", content)
     # clean up basic HTML cruft
     content = content.replace("&gt;", ">").replace("&lt;", "<")
     # strip out text markup, e.g. * for bold text
     content = content.replace("`", "").replace("*", "").replace("~", "")
     # normalize whitespace
     return normalize_whitespace(content)
Example #6
    def clean_tweet(self, text):
        # FIXED UNICODE
        text = preprocess.fix_bad_unicode(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()
        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REMOVE URL
        text = preprocess.replace_urls(text)

        # REMOVE EMAILS
        text = preprocess.replace_emails(text)

        # REMOVE PHONE NUMBERS
        text = preprocess.replace_phone_numbers(text)

        # REMOVE NUMBERS
        text = preprocess.replace_numbers(text)

        # REMOVE CURRENCY
        text = preprocess.replace_currency_symbols(text)

        # REMOVE ACCENTS
        text = preprocess.remove_accents(text)

        # CONVERT EMOJIS TO TEXT
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text))

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        text = preprocess.remove_punct(text)

        # Remove numbers
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        text = preprocess.normalize_whitespace(text)

        return text
Example #7
 def fulltext_extractor(self, d, clean_text=True):
     if 'fullText' in d:
         fulltext = d['fullText']
         if clean_text:
             fulltext = preprocess.normalize_whitespace(fulltext)
             fulltext = preprocess_text_by_config(fulltext,
                                                  self.textacy_defs)
         return fulltext
     else:
         return d
Example #8
def preprocess(text, fix_unicode=True, normalize_white_space=False, lowercase=False,
               transliterate=False, no_urls=False, no_emails=False, no_phone_numbers=False,
               no_numbers=False, no_currency_symbols=False, no_punct=False,
               no_contractions=False, no_accents=False):
    if normalize_white_space:
        text = pp.normalize_whitespace(text)
    text = pp.preprocess_text(text, fix_unicode=fix_unicode, lowercase=lowercase,
                              transliterate=transliterate, no_urls=no_urls, no_emails=no_emails,
                              no_phone_numbers=no_phone_numbers, no_numbers=no_numbers,
                              no_currency_symbols=no_currency_symbols, no_punct=no_punct,
                              no_contractions=no_contractions, no_accents=no_accents)
    return text
Example #9
def preprocess_sentence(sent):
    # TODO check language?
    s = preprocess.normalize_whitespace(sent)
    return preprocess.preprocess_text(s,
                                      lowercase=True,
                                      transliterate=True,
                                      no_urls=True,
                                      no_phone_numbers=True,
                                      no_numbers=True,
                                      no_currency_symbols=True,
                                      no_contractions=True,
                                      no_accents=True)
Example #10
def tokenizer(sentences):
    y = []
    if isinstance(sentences, str):
        sentences = [sentences]
    for comment in sentences:
        comment = my_preprocess(comment)
        txt = preprocess.normalize_whitespace(comment)

        txt = preprocess.preprocess_text(txt,
                                         fix_unicode=True,
                                         lowercase=True,
                                         transliterate=True,
                                         no_urls=True,
                                         no_emails=True,
                                         no_phone_numbers=True,
                                         no_numbers=True,
                                         no_currency_symbols=True,
                                         no_punct=True,
                                         no_contractions=True,
                                         no_accents=True)

        y.append(u''.join(txt))
    return y
Example #11
def question_mark_pos(line):
    """
       Get the list of positions of question mark in line
    """
    text = normalize_whitespace(line)
    token_pattern = r"\?+\s*\?+"
    # replace runs of question marks with just one
    text = re.sub(token_pattern, r"?", text)
    token_pattern = r"(?u)\b\w+\b|\?"
    r = re.compile(token_pattern)
    word_qmark = r.findall(text)
    pos_qmark = get_position_list("?", word_qmark)
    count_qmark = len(pos_qmark)
    if pos_qmark[0] == 0:
        count_word = len(word_qmark)
        pos_qmark = [len(word_qmark)]
    else:
        count_word = len(word_qmark) - count_qmark
        pos_qmark = [i - j for j, i in enumerate(pos_qmark, start=1)]
    pos_qmark = pos_qmark + [count_word]
    return pos_qmark
Example #12
def test_normalize_whitespace():
    text = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
    assert preprocess.normalize_whitespace(text) == proc_text
Example #13
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9ßöäüÖÄÜ_.:,;?!()&@/€\- ]', "", text)
    text = pp.normalize_whitespace(text)

    return text
Example #14
 def test_normalize_whitespace(self):
     text = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
     proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
     self.assertEqual(preprocess.normalize_whitespace(text), proc_text)
Example #15
 def test_normalize_whitespace(self):
     text = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
     proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
     self.assertEqual(preprocess.normalize_whitespace(text), proc_text)
Example #16
def label(corpus, output, kg_url, test_size, numdocs=None):
    """ Generate fasttext compatible text files

    Single label version
    """

    docs = read_corpus(corpus)

    if numdocs:
        docs = docs[:numdocs]

    kg = get_lemmatized_kg(kg_url)

    X, y = prepare_corpus(docs, kg)

    by_labels = defaultdict(list)

    for doc, tls in zip(X, y):
        label, count = tls[0]
        by_labels[label].append((count, doc))

    counts = [len(v) for v in by_labels.values()]
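    # cap every label at the size of the smallest one to balance the training set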
    max_docs = min(counts)

    X, y = [], []

    for label, counteddocs in by_labels.items():
        docs = sorted(counteddocs, key=lambda d: d[0], reverse=True)
        docs = [d[1] for d in docs]
        docs = docs[:max_docs]
        X.extend(docs)
        y.extend([label] * max_docs)

    # yy = []
    # for tl in y:
    #     ls = [x[0].lower().replace(' ', '_') for x in tl]
    #     yy.append(ls)
    # y = yy

    X = [
        normalize_whitespace((' </s> '.join(sents)).replace('dignr', ''))
        for sents in X
    ]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=0)
    train_path = output + '-train'
    test_path = output + '-test'

    with open(train_path, 'w') as f:
        for label, text in zip(y_train, X_train):
            ls = '__label__' + label.replace(' ', '_').lower()
            # ls = ' '.join(['__label__{}'.format(l) for l in labels[:1]])
            line = "{} {}".format(ls, text)
            f.write(line)
            f.write('\n')

    with open(test_path, 'w') as f:
        for label, text in zip(y_test, X_test):
            ls = '__label__' + label.replace(' ', '_').lower()
            # ls = ' '.join(['__label__{}'.format(l) for l in labels[:1]])
            line = "{} {}".format(ls, text)
            f.write(line)
            f.write('\n')

    logger.info("Wrote train file: %s", train_path)
    logger.info("Wrote test file: %s", test_path)