import re

def removestop(text):
    # escaped '-' makes it a literal hyphen rather than a character range
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    stops = {'so', 'his', 't', 'y', 'ours', 'herself', 'your', 'all',
    'some', 'they', 'i', 'of', 'didn', 
    'them', 'when', 'will', 'that', 'its', 'because', 
    'while', 'those', 'my', 'don', 'again', 'her', 'if',
    'further', 'now', 'does', 'against', 'won', 'same', 
    'a', 'during', 'who', 'here', 'have', 'in', 'being', 
    'it', 'other', 'once', 'itself', 'hers', 'after', 're',
    'just', 'their', 'himself', 'theirs', 'whom', 'then', 'd', 
    'out', 'm', 'mustn', 'where', 'below', 'about', 'isn',
    'shouldn', 'wouldn', 'these', 'me', 'to', 'doesn', 'into',
    'the', 'until', 'she', 'am', 'under', 'how', 'yourself',
    'couldn', 'ma', 'up', 'than', 'from', 'themselves', 'yourselves',
    'off', 'above', 'yours', 'having', 'mightn', 'needn', 'on', 
    'too', 'there', 'an', 'and', 'down', 'ourselves', 'each',
    'hadn', 'ain', 'such', 've', 'did', 'be', 'or', 'aren', 'he', 
    'should', 'for', 'both', 'doing', 'this', 'through', 'do', 'had',
    'own', 'but', 'were', 'over', 'not', 'are', 'few', 'by', 
    'been', 'most', 'no', 'as', 'was', 'what', 's', 'is', 'you', 
    'shan', 'between', 'wasn', 'has', 'more', 'him', 'nor',
    'can', 'why', 'any', 'at', 'myself', 'very', 'with', 'we', 
    'which', 'hasn', 'weren', 'haven', 'our', 'll', 'only',
    'o', 'before'}
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    text = text.replace(".", " ").replace(",", " ")
    return text
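A quick sanity check (assuming `import re` as above; this is what the fixed version should produce):

print(removestop("This is a simple test"))  # -> "simple test"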
def transform_review_text(text):
    # assumes module-level `stemmer` and `stopwords` (see sketch below)
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text_array = [
        stemmer.stem(word) for word in text.split() if word not in stopwords
    ]
    return ' '.join(text_array)
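`stemmer` and `stopwords` are assumed globals; a minimal NLTK-based setup might look like this (the choice of PorterStemmer is an assumption):

import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))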
Example #3
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove punctuation, mentions, hashtags and words containing numbers.'''
    # make text lowercase
    text = text.lower()
    # remove text within square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace (this also covers newlines) into one space
    text = re.sub(r'\s+', ' ', text)
    # remove double quotes
    text = re.sub(r'\"+', '', text)
    # remove the HTML entity &amp;
    text = re.sub(r'&amp;', '', text)
    # remove usernames
    text = re.sub(r'@[^\s]+', '', text)
    # remove hashtags
    text = re.sub(r'#[^\s]+', '', text)
    # remove `rt` (retweet) as a whole word only, so words like "start" survive
    text = re.sub(r'\brt\b', '', text)
    # string.punctuation is a string of all punctuation marks,
    # so this gets rid of all punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove the `httptco` residue left behind by stripped t.co links
    text = re.sub(r'httptco', '', text)
    return text
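For example, the fixed version should behave like:

clean_text_round1("Hello World! [ignore this] #topic")  # -> 'hello world ' (a trailing space is left behind)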
Example #4
def predict():

    # Prediction text
    text = request.args.get('text') or ""
    # Split on non-alphanumeric characters, then drop empty tokens and stopwords
    text = ' '.join(
        [i for i in re.split('[^a-z0-9]', text.lower()) if i and i not in stop])

    # Number of predictions to return (default 3)
    predictions = int(request.args.get('num', 3))

    retval = dict()
    # stop index for the reversed argsort slice below
    slicing_value = -1 * (predictions + 2)
    # prepend the query so it becomes row 0 of the TF-IDF matrix
    cat_train.insert(0, text)
    tfidf = TfidfVectorizer(max_df=0.4,
                            min_df=2,
                            stop_words='english',
                            norm='l1',
                            use_idf=True).fit_transform(cat_train)
    # cosine similarity of the query (row 0) against every row
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    # best matches first, skipping the query's self-match
    related_docs_indices = cosine_similarities.argsort()[-2:slicing_value:-1]
    for x in related_docs_indices:
        cos_value = cosine_similarities[x]
        if cos_value > 0:
            # x - 1 compensates for the query occupying row 0
            retval[category[x - 1]] = str(cos_value)
    del cat_train[0]
    response = jsonify(retval)
    response.headers.add('Access-Control-Allow-Origin', CORS)
    #response.headers.add('Access-Control-Allow-Origin', 'https://dashboard.heroku.com/apps/bazarr-web')
    return response
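The trick above is to prepend the query to the corpus, fit TF-IDF over everything, and rank rows by cosine similarity against row 0. A minimal standalone sketch (the document list is illustrative, not the app's real `cat_train`):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

docs = ["query text about dogs", "a document about cats", "a document about dogs"]
tfidf = TfidfVectorizer().fit_transform(docs)
sims = linear_kernel(tfidf[0:1], tfidf).flatten()
ranked = sims.argsort()[-2::-1]  # best matches first, skipping the query's self-match
print(ranked, sims[ranked])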
def preProcess(text):
    # assumes: from nltk.tokenize import RegexpTokenizer
    #          from nltk.corpus import stopwords
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    stops = set(stopwords.words('english'))  # build the set once, not per word
    filtered_words = [w for w in tokens if w not in stops and not w.isdigit()]
    print("preprocess: " + " ".join(filtered_words))
    return " ".join(filtered_words)
Example #6
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
Example #7
def clean_text(self):
    # this is a method; `self.doc` holds the raw document
    text = str(self.doc)
    # remove bracketed text first, then replace anything that is not a letter
    # (the original pattern [^a-zA-z] was a typo: the A-z range also matches [ ] ^ _ `)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = " ".join(text.split())
    text = text.lower()
    return text
Example #8
def clean_lowercase(text):
    # replace NaN values with an empty string
    if str(text) == 'nan':
        text = ''
    else:
        text = text.lower()
    return text
def MyNormalize(text):
    # assumes module-level `WNL` (lemmatizer) and `exceptions` (see sketch below)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in string.punctuation]
    toks = [
        WNL.lemmatize(t) if t not in exceptions else t for t in tokens
    ]
    return toks
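A possible setup for the assumed globals (illustrative; `exceptions` would hold tokens to leave unlemmatized):

import nltk
import string
from nltk.stem import WordNetLemmatizer

WNL = WordNetLemmatizer()
exceptions = set()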
Example #10
def ProcessText(text):
    """
    Processes either a string or a list of strings.
    """
    if isinstance(text, str):
        # strip HTML tags, hashtags and '@', then drop non-alphanumerics and lowercase
        text = re.sub(r"<[^>]*>", "", text)
        text = re.sub(r"#(\w+)", "", text)
        text = re.sub(r"@", "", text)
        text = re.sub(r"[^a-zA-Z0-9\s]+", "", text.lower())
        # remove url strings from text
        text = RemoveUrl(text)
        # remove leading and extra spaces
        text = re.sub(r"^ +", "", text)
        text = re.sub(r" +", " ", text)
        return text
    if isinstance(text, list):
        # same pipeline, applied element-wise
        return [ProcessText(item) for item in text]
    return None
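`RemoveUrl` is assumed to be defined elsewhere; a minimal sketch might be:

import re

def RemoveUrl(text):
    # drop anything that looks like an http(s) URL
    return re.sub(r"https?://\S+", "", text)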
def textClean(text):
    # escaped '-' makes it a literal hyphen, not a character range
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    # remove all the stop words
    stops = set(my_stop_words)  # `my_stop_words` is assumed to be defined elsewhere
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    return text
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # get rid of some additional punctuation and non-ASCII quotes missed the first time around
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text
Example #13
def generate_spelling_graph(text):
    # assumes helpers: remove_punct, tokenizing, remove_stopwords,
    # generate_pie_chart, generate_bar_chart, and `spell` (e.g. pyspellchecker)
    text = text.lower()
    text = remove_punct(text)
    text = tokenizing(text)
    text = remove_stopwords(text)
    misspelled = spell.unknown(text)
    sizes = [len(text) - len(misspelled), len(misspelled)]
    labels = 'correctly spelled words', 'misspelled words'
    generate_pie_chart(sizes, labels)
    generate_bar_chart(text, misspelled, "Occurrence")
Example #14
def sentimentAFINN(text):

    words = pattern_split.split(text.lower())
    # materialize the scores: in Python 3, map() returns an iterator,
    # which is always truthy and has no len()
    sentiments = [afinn.get(word, 0) for word in words]
    if sentiments:
        sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment
Example #15
def initial_clean(text):
    """
    Clean text: remove digits and special characters, lowercase, tokenize.
    """
    # remove digits and special chars
    text = re.sub("[^a-zA-Z ]", "", text)

    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return text
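For example (requires the NLTK punkt tokenizer data):

initial_clean("Hello, World 42!")  # -> ['hello', 'world']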
Example #16
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    #print('text after changing to lower case:- %s' %(text))
    text = re.sub(r'\[.*?\]', '', text)  # removes text inside square brackets
    text = re.sub('[%s]' % re.escape(string.punctuation), '',
                  text)  # removes punctuation marks from the string
    text = re.sub(
        r'\w*\d\w*', '', text
    )  # removes any word containing a digit (\w* letters around a \d digit)
    return text
Example #17
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = text.replace('-', ' ')
    text = text.replace("’s ", " ").replace("' ", " ")
    text = text.replace("s’ ", " ").replace("s' ", " ")  # remove possessives
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
Example #18
def process_text(text):
    # `nlp` is a loaded spaCy pipeline (see sketch below)
    doc = nlp(text.lower())
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.lemma_ == '-PRON-':  # spaCy v2 pronoun placeholder
            continue
        result.append(token.lemma_)
    return " ".join(result)
Example #19
def clean_data(text):
    """Clean data part 1: lowercase, strip punctuation, digits and stray whitespace."""
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\t', '', text)
    text = re.sub('\b', '', text)  # '\b' in a non-raw string is a literal backspace character
    text = re.sub('[^a-z ]+', '', text)
    text = re.sub(r'\s\s+', ' ', text)
    return text
Example #20
def process(text, tokenizer=TweetTokenizer(), stopwords=[]):
    """Process the text of a tweet:
    - Lowercase
    - Tokenize
    - Stopword removal
    - Digit, mention, hashtag and URL removal

    Return: list of strings
    """
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # the original one-liner relied on `tok is tok.strip('#')` (a fragile
    # identity check) and a shadowed generator expression to filter URLs;
    # this keeps the intended behavior readably
    return [
        tok for tok in tokens
        if tok not in stopwords
        and not any(ch.isdigit() for ch in tok)
        and len(tok) > 2
        and not tok.startswith(('#', '@'))
        and 'http' not in tok
    ]
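With the default empty stopword list, this should behave like:

process("Check this out!! http://t.co/x #nlp @you 2024")
# -> ['check', 'this', 'out']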
Example #21
def prepare_text(text):
    # assumes module-level `stopwords`, `stemmer` and a fitted `tf_vect`
    tweet_text = []
    X_test = None  # returned as-is if vectorization fails; the original raised NameError here
    try:
        tokens = nltk.word_tokenize(text.lower())
        clean_tokens = [word for word in tokens if word not in stopwords]
        stem_words = [stemmer.stem(token) for token in clean_tokens]
        tweet_text.append(" ".join(stem_words))
        X_test = tf_vect.transform(tweet_text)
    except Exception:
        pass

    return X_test
Example #22
def sentimentAFINN(text):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative values are negative valence.
    """
    words = pattern_split.split(text.lower())
    # materialize the scores: in Python 3, map() returns an iterator,
    # which is always truthy and has no len()
    sentiments = [afinn.get(word, 0) for word in words]
    if sentiments:
        # How should you weight the individual word sentiments?
        # You could do N, sqrt(N) or 1 for example. Here I use sqrt(N)
        sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment
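`afinn` and `pattern_split` are assumed globals; a typical setup (the AFINN-111 file path is an assumption):

import re
import math

pattern_split = re.compile(r"\W+")
# AFINN-111 is a tab-separated word/score list
afinn = dict((line.split('\t')[0], int(line.split('\t')[1].strip()))
             for line in open("AFINN-111.txt", encoding="utf-8"))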
Example #23
def preprocessing(text):
    # assumes helpers: remove_punct, tokenizing, remove_stopwords, lemmatizing
    if isinstance(text, pd.Series):
        text = [entry.lower() for entry in text]
        text = [remove_punct(entry) for entry in text]
        text = [tokenizing(entry) for entry in text]
        text = [remove_stopwords(entry) for entry in text]
        text = [lemmatizing(entry) for entry in text]
        text = [" ".join(entry) for entry in text]
    elif isinstance(text, str):
        text = text.lower()
        text = remove_punct(text)
        text = tokenizing(text)
        text = remove_stopwords(text)
        text = lemmatizing(text)
        text = " ".join(text)
    return text
Example #24
def cleaner(text):
    # replace every non-word character or underscore, except apostrophes, with a space
    filters_regex = r"(?!\')(?:\W|_)"
    clean_text = re.sub(filters_regex, ' ', text.lower())
    # collapse runs of whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
Example #25
def clean_text(text):
    '''
    Parameters:
        text: String
    Returns:
        String
    This function does the following to the text:
        converts to lowercase
        removes punctuation (replaced with spaces)
        removes special characters, digits and bare roman numerals
    Ref: https://stackoverflow.com/questions/42614458/how-to-replace-punctuation-with-whitespace
    '''
    # replace punctuation with spaces, then lowercase
    punc_list = list(string.punctuation)
    translator = text.maketrans(dict.fromkeys(punc_list, " "))
    cleantext = text.lower().translate(translator)
    # normalize spaces between words (the original `s.strip() is not ""`
    # compared identity, not content; plain truthiness does the right thing)
    cleantext = " ".join([s for s in cleantext.split(" ") if s.strip()])
    # remove any non-printable (non-ascii) characters in the text
    printable = set(string.printable)
    cleantext = "".join(ch for ch in cleantext if ch in printable)
    # remove bare roman numerals (the original split on the literal string
    # "\s+", which never splits, and its `toremove` entries carried spaces
    # that could never match; the punctuation entries were already handled)
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
    cleantext = " ".join(
        word for word in cleantext.split() if word not in toremove)

    # clear off all arabic numerals / digits, even when attached to text
    cleantext = re.sub(r'\d+', ' ', cleantext)

    cleantext = re.sub(' +', ' ', cleantext)
    return cleantext.strip()
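For example, the fixed version should behave like:

clean_text("Chapter ii The 3 Musketeers!")  # -> 'chapter the musketeers'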
Example #26
def get_parts(text):
    # assumes module-level `stop` (stopwords) and `punctuation` (see sketch below)
    nouns = set()
    descriptives = set()

    text = text.lower().split()
    text = [i for i in text if i not in stop]
    text = [i for i in text if i not in punctuation]
    text = [i for i in text if len(i) > 1]
    # nltk.pos_tag labels each token with a part-of-speech tag
    for word, pos in nltk.pos_tag(text):
        if pos in ['NN', 'NNP']:  # feel free to add any other noun tags
            nouns.add(word)
        elif pos in ['JJ', 'JJR']:
            descriptives.add(word)
    return list(nouns), list(descriptives)
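A possible setup for the assumed globals (illustrative):

import nltk
from string import punctuation

stop = set(nltk.corpus.stopwords.words('english'))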
Example #27
def tokenize_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # str.translate needs a translation table; passing string.punctuation
    # directly (as the original did) does not remove anything
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(text)
    words = [re.sub("[^A-Za-z0-9]", "", word) for word in words]

    stops = set(nltk.corpus.stopwords.words("english"))  # build once, not per word
    final_words = []
    for word in words:
        if not word:
            continue
        if word in stops:
            continue
        if word.startswith("@") or word.startswith("#"):
            continue
        if word.isnumeric():
            continue
        final_words.append(word)
    return final_words
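For example (requires NLTK punkt and stopwords data):

tokenize_text("Running fast!! Visit https://example.com now, 123 times")
# -> ['running', 'fast', 'visit', 'times']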
Example #28
def clean_text(text, remove_stop_words=False):
    text = text.lower()
    # map every punctuation character to a space
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)  # this also removes newlines and carriage returns

    if remove_stop_words:
        text = text.split()
        new_text = []
        stemmer = PorterStemmer()  # assumes: from nltk.stem import PorterStemmer

        for word in text:
            if word not in STOPWORDS:
                new_text.append(stemmer.stem(word))

        text = ' '.join(new_text)

    return text
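`STOPWORDS` is assumed to be defined at module level, e.g.:

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))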
Example #29
def clean_text(text, country):
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(),
                  text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and w != ' ' and not w.isdigit()
    ]
    return ' '.join(tokens)
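This one leans on several assumed globals; a plausible setup (the contractions map and country stopword lists are placeholders, not the author's real data):

import string
import unicodedata
from functools import reduce
from nltk.tokenize import TweetTokenizer

tk = TweetTokenizer()
contractions = {"can't": "cannot", "won't": "will not"}  # extend as needed

def strip_accents(text):
    # decompose accented characters and drop the combining marks
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')

usa_stopwords = canada_stopwords = britain_stopwords = set()  # placeholder lists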
Example #30
def postTags():
    retval = {'tags': []}
    text = re.sub('%20', ' ',
                  request.args.get('text')) if request.args.get('text') else ""

    # Remove characters that are not punctuation, numbers or letters
    text = re.sub(r'[^a-zA-Z0-9\s.?!-]', '', text)
    # Collapse extra spaces and tabs (the original pattern '[\s+]' matched a
    # single whitespace character or a literal '+', not runs of whitespace)
    text = re.sub(r'\s+', ' ', text)

    print(text)

    blob = TextBlob(text.lower())
    temp = blob.tags

    # pair each adjective with the next noun, e.g. "great product"
    for i in range(len(temp)):
        if temp[i][1] == 'JJ':
            k = 1
            while i + k < len(temp) and k < 2:
                if temp[i + k][1] == 'NN' or temp[i + k][1] == 'NNP':
                    retval['tags'].append(
                        string.capwords(temp[i][0] + ' ' + temp[i + k][0]))
                    temp[i + k] = (temp[i + k][0], 'DONE')
                    break
                k += 1

    # also keep standalone proper nouns and past participles (parenthesized:
    # the original condition relied on or/and operator precedence)
    for i in range(len(temp)):
        if (temp[i][1] == 'NNP' or temp[i][1] == 'VBN') and temp[i][0] not in stop:
            retval['tags'].append(string.capwords(temp[i][0].lemmatize()))

    retval['tags'] = list(set(retval['tags']))
    response = jsonify(retval)
    response.headers.add('Access-Control-Allow-Origin', CORS)
    return response
Example #31
def tokenize_and_stem(text):
    # `stem` is assumed to map a list of tokens to their stems (see sketch below)
    return stem(text.lower().split(" "))
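A minimal sketch of the assumed `stem` helper:

from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()

def stem(tokens):
    return [_stemmer.stem(t) for t in tokens]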
Example #32
def normalizeWords(self, text):
    # split on runs of non-word characters (re.UNICODE is the default in Python 3)
    return re.compile(r'\W+', re.UNICODE).split(text.lower())
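Note that splitting on \W+ leaves an empty string at a non-word boundary:

re.compile(r'\W+', re.UNICODE).split("Hello, world!".lower())
# -> ['hello', 'world', '']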