def createCooccurrenceMatrix(stoplist, listOfText, windSize=1):
    """Count windowed co-occurrences for every ordered vocabulary pair.

    The vocabulary comes from ``createVocabulary(stoplist, listOfText)`` and
    each text is normalised with ``cleaningText(stoplist, text)``.
    NOTE(review): the indexing below assumes ``cleaningText`` returns a
    sequence of tokens (e.g. a list of words) -- confirm against its
    definition.

    Args:
        stoplist: Passed through unchanged to ``createVocabulary`` and
            ``cleaningText``.
        listOfText: Iterable of raw texts.
        windSize: Offset between the two words; ``vocab[y]`` must sit exactly
            ``windSize`` positions after ``vocab[x]`` to be counted.

    Returns:
        A list with one row per pair ``(x, y)`` with ``x < y``, in pair
        order.  Each row has ``len(vocab)`` entries and only ``row[y]`` is
        ever incremented: it holds how often ``vocab[y]`` was found
        ``windSize`` positions after ``vocab[x]`` across all texts.
    """
    vocab, numberOfVocab = createVocabulary(stoplist, listOfText)
    print(vocab)
    print(numberOfVocab)

    # Clean every text exactly once instead of once per vocabulary pair.
    cleanedTexts = [cleaningText(stoplist, text) for text in listOfText]
    vocabSize = len(vocab)

    cooccurenceMat = []
    for x in range(vocabSize):
        for y in range(x + 1, vocabSize):
            print(vocab[x], ' - ', vocab[y])
            row = [0] * vocabSize
            for text in cleanedTexts:
                if vocab[x] in text and vocab[y] in text:
                    i = text.count(vocab[x])
                    index = text.index(vocab[x])
                    if i == 1 and index + windSize < len(text) and text[
                            index + windSize] == vocab[y]:
                        row[y] += 1
                        print(row)
                    elif i > 1:
                        # Several occurrences of vocab[x]: test each one.
                        # The row is deliberately NOT reset here, so counts
                        # gathered from earlier texts are kept.
                        for k in range(len(text)):
                            if text[k] == vocab[x] and k + windSize < len(
                                    text) and text[k + windSize] == vocab[y]:
                                row[y] += 1
                        print(row)
            cooccurenceMat.append(row)

    return cooccurenceMat
Пример #2
0
    def transform(self, docs):
        """Compute simple surface statistics for each document.

        Args:
            docs: Iterable of documents.  Entries that are not ``str`` (for
                example floats, which ``unescape`` cannot process) are
                treated as empty text.

        Returns:
            A list with one dict per document, holding the integer counts
            ``num_dollar_signs``, ``num_hashtags``, ``num_paragraphs``,
            ``num_sentences`` and ``num_words``.
        """
        punct_pattern = re.compile("[%s]" % re.escape(string.punctuation))
        # Paragraph separators counted in the raw text: a literal
        # backslash-escaped "\r\n", a real CRLF, a double <br /> and <p>.
        paragraph_markers = ("\\r\\n", "\r\n", "<br /><br />", "<p>")

        stats_list = []
        for text in docs:
            if not isinstance(text, str):
                text = ""
            no_html = unescape(text)
            # split() without arguments drops the empty strings left behind
            # by punctuation and repeated spaces, so they are not counted
            # as words.
            words = punct_pattern.sub(" ", no_html).split()

            stats_list.append({
                "num_dollar_signs": no_html.count("$"),
                "num_hashtags": no_html.count("#"),
                # A plain int: one paragraph plus one per separator found.
                "num_paragraphs": 1 + sum(
                    text.count(marker) for marker in paragraph_markers
                ),
                "num_sentences": text.count("."),
                "num_words": len(words),
            })

        return stats_list
Пример #3
0
def count_twitter_objs(text):
    """Count the URLs, mentions and hashtags in a text string.

    The text is rewritten in four steps and the inserted placeholders are
    then counted:
    1) runs of whitespace are collapsed to a single space
    2) URLs are replaced with URLHERE
    3) mentions are replaced with MENTIONHERE
    4) hashtags are replaced with HASHTAGHERE

    This gives standardized counts without caring about the specific
    people or tags mentioned.  Because the placeholders themselves are
    counted, a text that already contains one of them literally is counted
    as well.

    Returns:
        A tuple ``(url_count, mention_count, hashtag_count)``.
    """
    # Raw strings keep the regex escapes (\s, \w, \() from being read as
    # invalid string escapes, which newer Python versions warn about.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'
    text = re.sub(space_pattern, ' ', text)
    text = re.sub(giant_url_regex, 'URLHERE', text)
    text = re.sub(mention_regex, 'MENTIONHERE', text)
    text = re.sub(hashtag_regex, 'HASHTAGHERE', text)
    return (text.count('URLHERE'), text.count('MENTIONHERE'), text.count('HASHTAGHERE'))
def get_hashtags_and_user_mentions(special_characters,
                                   text,
                                   wanted_characters=('#', '@')):
    """Strip marker-prefixed tokens from ``text`` and collect the wanted ones.

    For every marker in ``special_characters`` (e.g. ``'#'``, ``'@'``,
    ``'http'``) each token that starts with the marker and runs up to the
    next whitespace character (or the end of the text) is removed from the
    text.  Removed tokens longer than two characters whose marker is listed
    in ``wanted_characters`` are collected, once per occurrence.

    Args:
        special_characters: Iterable of marker strings whose tokens are
            removed from the text.  Empty markers are ignored.
        text: The text to process.
        wanted_characters: Markers whose tokens are returned.

    Returns:
        A dict mapping each wanted marker to the list of tokens found for it
        (an empty list when there are none), plus the key ``'clean_text'``
        holding the remaining text with whitespace collapsed to single
        spaces.
    """
    whitespace = re.compile(r'\s')
    results = {wanted: [] for wanted in wanted_characters}
    for character in special_characters:
        if not character:
            # An empty marker matches everywhere and would never terminate.
            continue
        # Collapse runs of the marker ("##tag") and put a space in front of
        # it so tokens glued together ("#a#b") become separate tokens.  The
        # marker is escaped so regex metacharacters are taken literally.
        text = re.sub('(?:' + re.escape(character) + ')+',
                      lambda _match, marker=character: ' ' + marker,
                      text)
        while character in text:
            start = text.find(character)
            # The token ends at the first whitespace after the marker; when
            # there is none it runs to the end of the text.
            boundary = whitespace.search(text, start + len(character))
            end = boundary.start() if boundary else len(text)
            token = text[start:end]
            if len(token) > 2 and character in wanted_characters:
                results[character].append(token)
            # Cut out exactly this occurrence so longer tokens sharing the
            # same prefix (e.g. "#tag" / "#tagged") stay intact.
            text = ' '.join((text[:start] + text[end:]).split())
    results['clean_text'] = ' '.join(text.split())
    return results
Пример #5
0
def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation means the characters in ``string.punctuation``.  The ratio
    is rounded to three decimals before being scaled to a percentage.

    Returns:
        The percentage as a float; ``0.0`` when ``text`` has no non-space
        characters (empty or spaces only), where the ratio is undefined.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Avoid dividing by zero for empty or all-space input.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_chars, 3) * 100.
# text = "##kcacolombia https://t.co/9vgki1tgsu	apple re cago los emojis en ios10 beta 4\nhora de las noticias insolitas, mundo curioso en la papaya de oxigeno 100.4 \n#papayacuriosa"
# text = "@perezjhonatan17  no pienso discutir con alguien que no le gusta el aguacate pero si el jugo de mora y tomate, adios hombre horrible"
# text = "@Radioacktiva_ @juankiss67 Buena tarde @juankiss67 saludo desde el centro de Bogota, este integrante de la tropa te https://t.co/DrmuIKuqyS"
# text = "@ICETEX Buen dia. Cuando se realiza el desembolso del fondo para el acceso a educacion superior para victimas del conflicto armado? Gracias."
# text = "#almuerzo##dieta##comersaludable en En Algun Lugar Del Mundo https://t.co/0vTJafidwc"
# text = "tajada jajajaja #trabajosihay #lideres @xsalo_ @deportecali @shelsetatiana @d_ospina1"
text = text.lower()
print(text)
text = re.sub('(#)+', ' #', text)
print(text)
user_mentions = []
hashtags = []
characters = ["#", "@", "http"]
for character in characters:
    print(character)
    count_character = text.count(character)
    print(count_character)
    if count_character > 0:
        while count_character > 0:
            print(count_character)
            print(character)
            start = text.find(character)
            end = text.find(" ", start)
            if end == -1:
                end = len(text)
            text_to_remove = text[start:end]
            print(text.count(text_to_remove))
            print(text_to_remove)
            if character == "#":
                hashtags.append(text_to_remove)
            elif character == "@":