Example #1
import re
import string


def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = text.replace('-', ' ')
    text = text.replace("’s ", " ").replace("' ", " ")
    text = text.replace("s’ ", " ").replace("s' ", " ")  # remove proper possessives
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
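# A quick, made-up sanity check for clean_text_round1; the sample sentence is illustrative only.
sample = "Hello [citation needed] World-Wide! John's 2nd try costs $50."
print(clean_text_round1(sample))
# -> lowercased, bracketed text removed, punctuation stripped, "2nd" and "50" dropped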
Example #2
def removestop(text):
    # keep letters, digits and a small whitelist of basic punctuation; everything else becomes a space
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = text.lower().split()
    stops = ['so', 'his', 't', 'y', 'ours', 'herself', 'your', 'all', 
    'some', 'they', 'i', 'of', 'didn', 
    'them', 'when', 'will', 'that', 'its', 'because', 
    'while', 'those', 'my', 'don', 'again', 'her', 'if',
    'further', 'now', 'does', 'against', 'won', 'same', 
    'a', 'during', 'who', 'here', 'have', 'in', 'being', 
    'it', 'other', 'once', 'itself', 'hers', 'after', 're',
    'just', 'their', 'himself', 'theirs', 'whom', 'then', 'd', 
    'out', 'm', 'mustn', 'where', 'below', 'about', 'isn',
    'shouldn', 'wouldn', 'these', 'me', 'to', 'doesn', 'into',
    'the', 'until', 'she', 'am', 'under', 'how', 'yourself',
    'couldn', 'ma', 'up', 'than', 'from', 'themselves', 'yourselves',
    'off', 'above', 'yours', 'having', 'mightn', 'needn', 'on', 
    'too', 'there', 'an', 'and', 'down', 'ourselves', 'each',
    'hadn', 'ain', 'such', 've', 'did', 'be', 'or', 'aren', 'he', 
    'should', 'for', 'both', 'doing', 'this', 'through', 'do', 'had',
    'own', 'but', 'were', 'over', 'not', 'are', 'few', 'by', 
    'been', 'most', 'no', 'as', 'was', 'what', 's', 'is', 'you', 
    'shan', 'between', 'wasn', 'has', 'more', 'him', 'nor',
    'can', 'why', 'any', 'at', 'myself', 'very', 'with', 'we', 
    'which', 'hasn', 'weren', 'haven', 'our', 'll', 'only',
    'o', 'before']
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    text = text.replace(".", " ").replace(",", " ")
    return text
Example #3
def named_entities(text):
    '''
    Replaces all named entities
    before vectorization.
    '''
    for k, v in entities.items():
        text = text.replace(k, v)
    return text
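# `entities` is expected to be a module-level dict mapping entity strings to canonical tokens;
# the mapping below is a made-up illustration, not the original one.
entities = {"New York City": "new_york_city", "J. K. Rowling": "j_k_rowling"}

print(named_entities("J. K. Rowling spoke in New York City."))
# -> "j_k_rowling spoke in new_york_city."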
Example #4
def remove_special_characters(text):
    '''
    Doesn't remove digits
    Input: "Well this was fun! What do you think?\n 123#@!__ 123_"
    Output: "Well this was fun What do you think\n 123 123"
    '''
    text = text.replace('&', 'and')
    # strip everything except letters, digits and whitespace
    # (use r'[^a-zA-Z\s]' instead if digits should be removed as well)
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text
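# Usage sketch reproducing the docstring example; assumes only `import re`.
import re

print(remove_special_characters("Well this was fun! What do you think?\n 123#@!__ 123_"))
# -> "Well this was fun What do you think\n 123 123"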
Example #5
def corpus_specific_text_cleaning(text):
    """
    Corpus-specific cleaning. Kept in this file because it has to be adapted to the corpus and is therefore a kind of configuration.
    """
    text = text.replace('"full_text" : ', "").strip().replace('"', '').replace(
        '\\n*', ' ').replace('\\', ' ').replace('&amp',
                                                ' ').replace("'ve", ' have')
    text = text.replace("don't",
                        'do not').replace("doesn't", 'does not').replace(
                            "Don't", 'Do not').replace("Doesn't", 'Does not')
    text = text.replace("_NEWLINE_", " ").replace(
        "_CITATION_PREVIOUS_POST_PARAGRAPH",
        " ").replace("_CITATION_PREVIOUS_POST_", " ").replace("_POSTER_", " ")
    # drop tokens that contain links or @-mentions
    no_links = []
    for word in text.split(" "):
        if "//" not in word and "http" not in word and "@" not in word:
            no_links.append(word)
    cleaned_text = " ".join(no_links)
    return cleaned_text
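# A made-up input showing the intended effect: the JSON field prefix, the quotes and the link token are dropped.
raw = '"full_text" : "I don\'t like this http://t.co/abc at all"'
print(corpus_specific_text_cleaning(raw))
# -> "I do not like this at all"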
Example #6
def remove_extra_meaningless_newlines(text):
    '''
    Input: "word1\r\n word2\t\r\n \r\n  \r\n   word3 \n  \r \n  \n    \n mst"
    Output: "word1\nword2\t\nword3 \nmst" --> collapses whitespace-only lines and strips the leading whitespace of the remaining lines
    '''
    text_cln = text.replace('\r', '\n')
    while True:
        len_ini = len(text_cln)
        text_cln = re.sub('\\n +\\n', '\\n', text_cln)
        if (len(text_cln) == len_ini): break
    text_cln = '\n'.join([ele for ele in text_cln.split('\n') if len(ele) > 0])
    text_cln = re.sub('\\n *', '\\n', text_cln)
    # print(text_orig);print('-'*10);print(text)
    return text_cln
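# Reproducing the docstring example; assumes only `import re`.
import re

messy = "word1\r\n word2\t\r\n \r\n  \r\n   word3 \n  \r \n  \n    \n mst"
print(repr(remove_extra_meaningless_newlines(messy)))
# -> 'word1\nword2\t\nword3 \nmst'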
Example #7
def _remove_personality_types(text: str) -> str:
    """Removes all mentions of personality types from text to avoid overfitting."""

    for ptype in config["types"]:
        # strip possessive and plural forms before the bare type so that
        # e.g. "INTJs" is not reduced to a stray "s"
        for base in (ptype, ptype.upper()):
            text = text.replace(base + "'s", "")
            text = text.replace(base + "s", "")
            text = text.replace(base, "")
    return text
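# `config["types"]` is assumed to hold lowercase type names; the config below is a made-up stand-in.
config = {"types": ["intj", "enfp"]}

print(_remove_personality_types("Typical INTJs and an ENFP's view on intj traits."))
# -> "Typical  and an  view on  traits."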
Example #8
def clean_text(text, country):
    # `contractions`, `strip_accents`, `tk` and the per-country stopword lists
    # are assumed to be defined at module level (see the sketch after the function)
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(),
                  text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and w != ' ' and not w.isdigit()
    ]
    return ' '.join(tokens)
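# A hedged usage sketch for clean_text: the contraction map, tokenizer, accent stripper and
# stopword lists below are simplified stand-ins for the module-level objects the function expects.
import string
import unicodedata
from functools import reduce


def strip_accents(text):
    # drop combining accent marks after canonical decomposition
    return ''.join(c for c in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(c))


class _WhitespaceTokenizer:
    def tokenize(self, s):
        return s.split()


tk = _WhitespaceTokenizer()
contractions = {"can't": "cannot", "won't": "will not"}  # illustrative subset
usa_stopwords = canada_stopwords = britain_stopwords = {"the", "and", "for"}

print(clean_text("I can't believe the café's Wi-Fi!", country='USA'))
# -> "cannot believe cafe"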
Example #9
def get_hashtags_and_user_mentions(special_characters,
                                   text,
                                   wanted_characters=['#', '@']):
    # Identify hashtags, user mentions and remove urls
    results = {}
    for character in special_characters:
        text = re.sub('(' + character + ')+', ' ' + character, text)
        count_character = text.count(character)
        if count_character > 0:
            while count_character > 0:
                start = text.find(character)
                # the token ends at the next space or newline, whichever comes first
                space_pos = text.find(" ", start)
                newline_pos = text.find("\n", start)
                bounds = [pos for pos in (space_pos, newline_pos) if pos != -1]
                end = min(bounds) if bounds else len(text)
                text_to_remove = text[start:end]
                if len(text_to_remove) > 2:
                    if character in wanted_characters:
                        if character in results.keys():
                            results[character].append(text_to_remove)
                        else:
                            results[character] = [text_to_remove]
                text = text.replace(text_to_remove, "")
                text = ' '.join(text.split())
                count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results.keys():
            results[wanted_character] = []
    text = text.strip(' ')
    text = ' '.join(text.split())
    results['clean_text'] = text
    return results
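# Usage sketch for the function above (assumes only `import re`); the tweet is made up.
import re

tweet = "Loving the ##summer vibes @alice @bob http://t.co/xyz #fun"
out = get_hashtags_and_user_mentions(['#', '@'], tweet)
# out['#']          -> ['#summer', '#fun']
# out['@']          -> ['@alice', '@bob']
# out['clean_text'] -> 'Loving the vibes http://t.co/xyz'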
Example #10
def corpus_specific_text_cleaning(text):
    text = text.replace(" ,", ",").replace("( ", " (").replace(" )", ") ")
    return text
Example #11
                                      dictionary,
                                      sort_topics=False,
                                      n_jobs=-1)  #, R=15)
pyLDAvis.save_html(lda_display, displayname)
webbrowser.open(displayname, new=2)

#save backup
pickle.dump(ldamodel, open('ldamodel.pkl', 'wb'))

#save topics and terms
topics = pd.DataFrame(
    ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=30)).drop(columns=0)

for i in range(len(topics)):
    text = topics.iat[i, 0]
    topics.iat[i, 0] = text.replace('*', ' ')  #clean for easier reading

topics.to_csv('gensim_topics.csv', index=False, header=False)
"""apply LDA model to each consolidated document for each category"""

#merge data
combined_text = pd.DataFrame(raw_data['Cat'].unique().tolist())

corpusdf = combined_text.copy(deep=True)

for i in range(0, len(combined_text)):
    Cat = combined_text.iat[i, 0]
    corpusdf.iat[i, 0] = raw_data[raw_data['Cat'].str.match(Cat)]['1'].str.cat(
        sep=' ')

#clean data
def replace_words(text, dicty):
    for i, j in dicty.items():
        text = text.replace(i, j)
    return text
    if count_character > 0:
        while count_character > 0:
            print(count_character)
            print(character)
            start = text.find(character)
            end = text.find(" ", start)
            if end == -1:
                end = len(text)
            text_to_remove = text[start:end]
            print(text.count(text_to_remove))
            print(text_to_remove)
            if character == "#":
                hashtags.append(text_to_remove)
            elif character == "@":
                user_mentions.append(text_to_remove)
            text = text.replace(text_to_remove, "")
            text = ' '.join(text.split())
            count_character = text.count(character)
text = text.strip(' ')
text = ' '.join(text.split())
print(text)
print(user_mentions)
print(hashtags)

path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
hashtags_with_what_words = []
for hashtag in hashtags:
    for word in what_food_list:
Example #14
def remove_punct(text):
    # `regex` is assumed to be a precompiled, module-level punctuation pattern
    text = text.replace("/", " or ")
    text = regex.sub('', text)
    return text
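# A plausible stand-in for the `regex` global plus a quick call; not the original pattern.
import re
import string

regex = re.compile('[%s]' % re.escape(string.punctuation))  # stand-in

print(remove_punct("rock/pop, jazz & blues!"))
# -> "rock or pop jazz  blues"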