Example #1
    def remove_users(self):
        '''Takes a string and removes retweet and @user information'''
        self.text = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9_-]+)', '',
                           self.text)  # remove retweet
        self.text = re.sub(r'(@[A-Za-z]+[A-Za-z0-9_-]+)', '',
                           self.text)  # remove tweeted at
        return self
Example #2
def text_cleaning(text):
    stop = stopwords.words('english') + [
        "would", "could", "also", "one", "ha", "can't", "it's", "i've", "u",
        "it", "us", "we", "t", "s"
    ]  # define stopwords list

    # cleaning
    if pd.isnull(text):
        return ""
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)  # remove URLs
    text = text.lower()  # to lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove digits
    text = re.sub(r'(.)\1+', r'\1\1', text)  # collapse runs of a repeated character to two (e.g. "soooo" -> "soo")
    unis_emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    text = unis_emojis_pattern.sub(r' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuations
    text = ' '.join(x for x in text.split()
                    if x not in stop)  # remove stopwords
    return text
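As written, text_cleaning assumes that re, pandas (as pd) and NLTK's stopword list are already available at module level; a minimal setup sketch under that assumption:

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)  # fetch the stop-word corpus once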
Example #3
def clean_tweet(tweet):
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', tweet)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    cleaned = (' '.join(words)).strip()  # avoid shadowing the function name
    return cleaned
Example #4
def clean_text(text):
    # remove apostrophes
    text = re.sub(r"'", "", text)
    # remove everything except alphabets
    text = re.sub("[^a-zA-Z]", " ", text)
    # remove whitespaces
    text = ' '.join(text.split())
    # convert text to lowercase
    text = text.lower()
    return text
Example #5
    def remove_hashtags(self):
        # keep the word, delete only the hash sign
        self.text = re.sub(pattern=get_hashtags_pattern(), repl=r'\1', string=self.text)
        # self.text = re.sub(pattern=get_hashtags_pattern(), repl=segment(r'\1'), string=self.text)  # keep the word, delete only the hash sign
        # text = re.sub(r'#([^\s]+)', r'\1', text)
        # self.text = re.sub(pattern=get_hashtags_pattern(), repl='', string=self.text)  # delete the hashtag completely

        return self
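get_hashtags_pattern() comes from elsewhere in this project; a minimal sketch of what it presumably returns, so that repl=r'\1' keeps the tag text and drops only the hash sign (the exact pattern is an assumption):

import re


def get_hashtags_pattern():
    # Hypothetical helper: capture the tag text after '#' so the replacement can keep it.
    return re.compile(r'#(\w+)')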
Example #6
def remove_special_character(text: str) -> str:
    """
    replace some special character with space from text
    :param text:
    :return:
    """
    # https://www.compart.com/en/unicode/
    # remove some non-chinese special symbols
    # https://unicode-table.com/en/blocks/basic-latin/
    # !"#$%'()*+,;<=>[\]^`{|}~
    # https://unicode-table.com/en/blocks/box-drawing/
    # 2500—257F
    # https://unicode-table.com/en/blocks/cjk-symbols-and-punctuation/
    # 3000—303F
    # https://unicode-table.com/en/blocks/cjk-compatibility-forms/
    # FE30—FE4F
    # https://unicode-table.com/en/blocks/halfwidth-and-fullwidth-forms/
    # FF00—FFEF
    return re.sub(
        r'['
        r'!"#$%\'()*+,;<=>[\\\]^`{|}~'
        r'\u2500-\u257F'
        r'\u3000-\u303F'
        r'\uFE30-\uFE4F'
        r'\uFF00-\uFFEF'
        r']+', ' ', text)
Example #7
    def hashtags(self, w):
        # w = "#MOSHEBiran_TheKing-OFThe!World19"
        lst_parsed = []
        if w[0] == "#":

            # lst_parsed.append(w.replace("_", "").lower())
            w = w[1:]  # drop the leading '#'

            if w.isalpha() and (w.islower() or w.isupper()):
                lst_parsed.append(w.lower())
                return lst_parsed

            if w.isalpha() and not any(ch in w for ch in "_-~"):
                myString = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', w)
                lst_parsed.extend([word.lower() for word in myString.split()])
                return lst_parsed

            else:

                myString = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', w)
                temp = ([word1.lower() for word1 in myString.split()])
                for word in temp:
                    if not word.isalpha():
                        start_of_next = 0
                        i = 0
                        while len(word) > i:

                            if word[i].islower():
                                while len(word) > i and word[i].islower():
                                    i += 1

                            elif word[i].isnumeric():
                                while len(word) > i and word[i].isnumeric():
                                    i += 1

                            else:
                                while len(word) > i and not word[i].isalnum():
                                    i += 1
                                    start_of_next = i
                                continue

                            lst_parsed.append(word[start_of_next: i].lower())
                            start_of_next = i
                            continue
                    else:
                        lst_parsed.append(word.lower())
        return lst_parsed
Example #8
def build_training_set(train_file):
    t0 = time.time()
    
    nlp = en_core_web_sm.load()
    file = open(train_file, "r", encoding='utf8')
    uniques = [[],[]] #frequency array for each unique word in the corpus
    
    sents = [line for line in file if line != '\n']
    del sents[0] #remove unwanted extraneous carriage return at top of input file
    #for each sentence in the training file
    for s, sent in enumerate(sents[:2000]):
        #rebuild sentence and pass it the NLP parser for POS tagging
        parsed = nlp(re.sub(r"(\|\S*){2}(\s|$)", " ", sent))
        #initialise sentence container
        sents[s] = re.sub('\\n+','',sent).split(" ")
        quote = False #some sentences fail to close open quotes. This forces closure
        for t, (token, tagged) in enumerate(zip(sents[s], parsed)):
            word = token.split("|")
            if quote:
                quote = (tagged.tag_ != "''") #close currently open quote, as tagged by spacy
            else:
                quote = (tagged.tag_ == '``') #open a new quote, as tagged by spacy
            sents[s][t] = tuple([word[0], tagged.tag_, re.sub("^.-","",word[2]), quote]) #remove I/B prefixes
            #increment frequency of this word
            try:
                pos = uniques[0].index(word[0]) #find position in word array
            except ValueError:
                uniques[0].append(word[0]) #if not found then append the new word
                uniques[1].append(1) #and initialise with frequency of 1
            else:
                uniques[1][pos] += 1 #else increment frequency
        if s % 2000 == 0: #track the time (this is a long process)
            print(time.time() - t0, 'secs to proc', s, 'sentences')
    
    """==========================build features and labels======================================================="""
    X_train = []
    y_train = []
    for sent in sents[:2000]:
        if len(sent) > 1:
            #first get the word frequencies (ie. do this once only for this sentence)
            frequency = [uniques[1][uniques[0].index(token[0])] for token in sent]
            #extract the features for each word in this sentence
            X_train.append([extract_features(sent, position, frequency) for position in range(len(sent))])
            #append the training labels for each word in this sentence
            y_train.append([token[2] for token in sent])
    
    return X_train, y_train
Example #9
def convert_emoticons(old_text):
    smiley = emot.emoticons(old_text)
    new_text = re.sub(r'https?:\/\/.*[\r\n]*', '', old_text)  # remove URLs first
    if len(smiley) > 1 and smiley['flag']:
        for i in range(len(smiley['value'])):
            # replace in new_text so the URL removal above is kept
            new_text = new_text.replace(smiley['value'][i], " " + smiley['mean'][i] + " ")
    return new_text
Example #10
def preprocess_tweet(text):
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URL
    new_tweet = re.sub(r'<[^>]+>', '',
                       new_tweet)  # remove html (line breaks etc.)
    new_tweet = re.sub(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", '',
        new_tweet)  # remove email addresses anywhere in the tweet
    new_tweet = re.sub(r'#', '', new_tweet)  # remove hash sign from hashtags
    new_tweet = re.sub("[^a-zA-Z]", " ",
                       new_tweet)  # remove remaining special characters
    words = new_tweet.lower().split()  # do lowercase, split into words
    words = [word for word in words
             if word not in stopwords_english]  # remove stop words
    words = [stemmer.stem(word) for word in words]  # stemming
    words = [lemma.lemmatize(word) for word in words]  # lemmatization
    # join words list back to one tweet
    return " ".join(words)
Example #11
def convert_emphesize(text, return_count=False):
    # find fully capitalised words (two or more letters) used for emphasis
    emphs = re.findall(r'\b[A-Z]{2,}\b', text)
    emphs = set(emphs)
    if return_count:
        return len(emphs)
    for emph_ in emphs:
        text = re.sub(r'\b' + emph_ + r'\b', emph_ + ' emphh', text)
    return text
Example #12
def convert_emojis(old_text):
    smiley = emot.emoji(old_text)
    new_text = old_text
    if smiley['flag']:
        for i in range(len(smiley['value'])):
            # str.replace avoids treating emoji characters as regex syntax
            new_text = new_text.replace(smiley['value'][i], smiley['mean'][i])
    return new_text
Example #13
    def text_tokenizer(self, text: str):
        text = re.sub(r'\S*@\S*\s?', '', text)  # remove emails
        text = re.sub(r'^https?://.*[\r\n]*', '', text,
                      flags=re.MULTILINE)  # remove websites

        words = word_tokenize(text, 'english')
        words = list(filter(lambda word: len(word) >= self.min_length, words))

        # text = (list(map(lambda x: self.stemmer.stem(x), words)))
        tokens = (list(map(lambda x: self.lemmatizer.lemmatize(x), words)))
        p = re.compile('[a-zA-Z]+')
        filtered_tokens = list(
            filter(
                lambda token: p.match(token) and len(token) >= self.min_length,
                tokens))

        return filtered_tokens
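text_tokenizer is a method that reads min_length, lemmatizer and (in the commented-out line) stemmer from its instance; a minimal class skeleton, with attribute choices that are assumptions, could look like:

from nltk.stem import PorterStemmer, WordNetLemmatizer


class TextTokenizer:
    # Hypothetical host class supplying the attributes the method above uses.
    def __init__(self, min_length: int = 3):
        self.min_length = min_length           # minimum token length to keep
        self.lemmatizer = WordNetLemmatizer()  # used for the lemmatization step
        self.stemmer = PorterStemmer()         # only needed for the commented-out stemming line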
Example #14
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^a-zA-Z0-9 ]', " ", s)
        w = nltk.word_tokenize(clean)
        # lemmatizing
        words.append([lemmatizer.lemmatize(i.lower()) for i in w])

    return words
Example #15
def clean(string_rep):
    string_rep = re.sub('[^a-zA-Z .]', ' ', string_rep)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    stop_words = set(stopwords.words('english'))  # build the stopword set once
    new_vocab = [w for w in new_vocab if w not in stop_words]
    data = ' '.join(new_vocab)

    return data
Example #16
def clean_text(text):
    text = text.lower()    # lower text
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = word_tokenize(text)
    text = [x for x in text if x not in stop_words]     # remove stop words
    pos_tags = pos_tag(text)
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]    # lemmatize using the POS tag of each token
    # text = [lemmatizer.lemmatize(t) for t in text]                                # alternative: lemmatize without POS tags
    text = [t for t in text if len(t) > 1]     # remove words with only one letter or empty
    return (text)
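get_wordnet_pos is not shown in this example; a common implementation, assumed here, maps the Penn Treebank tags produced by pos_tag onto WordNet POS constants so the lemmatizer receives the right part of speech:

from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    # Map the first letter of a Penn Treebank tag to a WordNet POS constant;
    # fall back to noun, which is also WordNetLemmatizer's default.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN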
Example #17
File: __init__.py  Project: gelkin/iRnWs
def prepare_arguments(query, stopwords=set()):
    tokens = query.split()
    prepared_tokens = []
    stemmer = PorterStemmer()
    for token in tokens:
        token = re.sub(r'\W+', '', token)
        token = stemmer.stem(token)
        if token and token not in stopwords:
            prepared_tokens.append(token)
    return prepared_tokens
Example #18
def preprocess_corpus(corpus: List[str]) -> List[str]:
	"""
    Preprocess nlp corpus

    Parameters
    ----------
    corpus : List[str]
        list of tweet texts

    Returns
    -------
    final_corpus : List[str]
        list of tweet texts, but processed
    """
	# Download the English stop words from the NLTK repository.
	nltk.download('stopwords', quiet=True)
	# Removing the links from the tweets (starting with https:// until a space)
	corpus_no_url = [re.sub(r'(https://)\S*(\s|$)', '', line) for line in corpus]
	# Removing stopwords from English language (compare lowercased tokens, since the NLTK list is lowercase)
	stopWords = set(stopwords.words('english'))
	corpus_no_stops_no_url = [" ".join([re.sub('\n', "", l) for l in line.split(" ") if l.lower() not in stopWords]) for line in
	                          corpus_no_url]
	# Removing @...
	corpus_noAt_no_stops_no_url = [re.sub(r'(@)\S*(\s|$)', '', line) for line in corpus_no_stops_no_url]
	# Remove #
	corpus_noht_noAt_no_stops_no_url = [re.sub('#', '', line) for line in corpus_noAt_no_stops_no_url]
	# Set lowercase
	corpus_lowercase = [line.lower() for line in corpus_noht_noAt_no_stops_no_url]
	# Remove numbers
	corpus_no_numbers = [re.sub('[0-9]+', '', line) for line in corpus_lowercase]
	final_corpus = corpus_no_numbers

	# if debug_prints:
	#     print("\n*** CORPUS BEFORE ***")
	#     [print("\t* " + str(l)) for l in corpus[0:5]]
	#     print()
	#
	#     print("\n\n*** CORPUS AFTER ***")
	#     [print("\t* " + str(l)) for l in final_corpus[0:5]]
	#     print()

	return final_corpus
Example #19
def replace_owner_names(text):
    regex = r"dimitr(?:iou|io|i)s*|\bef{1,2}i|nan(?:c|s)+y|giannis*|marin*(?:a|o)+s*|mano(?:li)*s*|mike|mic*halis*|" \
            r"niko(?:la)*s*|artemis*|alex(?:andro|i)+s*|tasos*|al(?:i|y)+ki|g(?:i|e)+org(?:i|o)+s|george|efthimia|" \
            r"mohammed|eva|antonia|nikk*i(?:ta)*s*|mustafa|anton(?:io|i)+s*|vill*y|k(?:y|i)+riakos*|vett*a|giorgaros*|" \
            r"anthi|(?:ch|x)+ristina|yannis*|dion(?:i|y|u|h|e)+s(?:i|y|u|h|e)+s*|thaleia|(?:ch|x)+ristos*|" \
            r"andria(?:nna)*|panagi(?:oti)*s|theopoula|popp*(?:i|y)+|vass*il(?:eio|i)+s*|asimina|fross*o|tina|andreas*|" \
            r"geronim(?:u|o)s*|gerr*y|babis*|areti|nick|savvas*|stergos*|sergios*"

    replace = 'owner_name'

    return re.sub(regex, replace, text)
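A quick usage sketch (the input sentence is made up and assumed to be lowercased, since the pattern only lists lowercase names):

print(replace_owner_names("thanks dimitris and nancy for the quick reply"))
# -> "thanks owner_name and owner_name for the quick reply"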
Example #20
File: __init__.py  Project: gelkin/iRnWs
    def add_doc_to_index(self, doc_text: str, doc_id: int, stemmer):
        doc_text = doc_text.lower()
        tokens = doc_text.split()
        cnt = 0
        for token in tokens:
            token = re.sub(r'\W+', '', token)
            if not token:
                continue
            token = stemmer.stem(token)
            if token not in self.stopwords:
                self.index[token].append((doc_id, cnt))
                cnt += 1
Example #21
def parseEuroparl():
    txt = ""
    test_txt = ""
    cc = 0
    print("Reading Files ...")
    X = len(glob.glob('/home/antok/Downloads/txt/en/*.txt'))
    for i, filename in enumerate(
            glob.glob('/home/antok/Downloads/txt/en/*.txt')):
        print("Progress %d of %d\r" % (i, X), end="")
        if cc < 500:
            with open(filename, "r", encoding="utf-8") as f:
                for line in f:
                    if line not in ['\n', '\r\n']:
                        line = re.sub('<.*.>', "", line)
                        txt += line
                if len(txt) != 0:
                    if txt[len(txt) - 1] != ".":
                        txt += "."
            cc += 1
        elif cc >= 500 and cc < 650:
            with open(filename, "r", encoding="utf-8") as f:
                for line in f:
                    if line not in ['\n', '\r\n']:
                        line = re.sub('<.*.>', "", line)
                        test_txt += line
                if len(test_txt) != 0:
                    if test_txt[len(test_txt) - 1] != ".":
                        test_txt += "."
            cc += 1
        else:
            break

    print("tokenization ...")
    sentences = [sent for sent in sent_tokenize(txt)]
    test_sentences = [sent for sent in sent_tokenize(test_txt)]

    return txt, test_txt, sentences, test_sentences
Example #22
def Parse_HTML(html):
    # Get the metadata if it's available.
    meta_data = []
    meta_keywords = html.find("meta", attrs={"name": "keywords"})
    meta_description = html.find("meta", attrs={"name": "description"})
    if (meta_keywords):
        meta_data.append(("meta", meta_keywords.get("content")))
    elif (meta_description):
        meta_data.append(("meta", meta_description.get("content")))

    # Remove script tags which don't provide useful information.
    for script_tag in html("script"):
        script_tag.decompose()

    # Extract the headings and title from the html.
    headings = []
    for heading_tag in html.find_all(
        ["title", "h1", "h2", "h3", "h4", "h5", "h6"]):
        heading = heading_tag.extract()
        tag_name = "h"
        if (heading.name == "title"):
            tag_name = "title"
        heading_data = (tag_name, heading.get_text().replace("\n", " "))
        headings.append(heading_data)

    # Get the rest of the content and clean the format up.
    initial_text = html.get_text()
    edited_text = re.sub("\n +", "\n", initial_text)
    edited_text = re.sub("\n{3,}", "\n", edited_text)
    text = ("contents", edited_text)

    # Compile all the extracted information into an array.
    parsed_text = []
    for pair in headings:
        parsed_text.append(pair)
    for pair in meta_data:
        parsed_text.append(pair)
    parsed_text.append(text)

    return parsed_text
Example #23
def clean_tweets(tweet):
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)
    return tweets_clean
Example #24
    def remove_numbers(self, preserve_years=False):
        text_list = self.text.split(' ')
        kept = []
        for text in text_list:
            if text.isnumeric():
                # keep numeric tokens only when they are years and years are preserved
                if preserve_years and is_year(text):
                    kept.append(text)
            else:
                kept.append(text)

        self.text = ' '.join(kept)
        if not preserve_years:
            # strip any digits still embedded in other tokens
            self.text = re.sub('([0-9]+)', '', self.text)
        return self
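is_year is another helper from the surrounding project that is not shown; a minimal sketch, assuming a year is any four-digit number in a plausible calendar range:

def is_year(token: str) -> bool:
    # Hypothetical helper: treat 4-digit numbers between 1000 and 2100 as years.
    return token.isnumeric() and len(token) == 4 and 1000 <= int(token) <= 2100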
Example #25
def pre_processing(rel3):
    # Removing punctuation
    rel3 = re.sub(r'[^a-zA-Z0-9_\.]', ' ', rel3)

    # Tokenization first, so lemmatization operates on words rather than single characters
    tokens = nltk.word_tokenize(rel3)

    # Lemmatization (verb, noun, then adjective)
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(t, pos='v') for t in tokens]
    tokens = [lem.lemmatize(t, pos='n') for t in tokens]
    tokens = [lem.lemmatize(t, pos='a') for t in tokens]
    return tokens
Example #26
def clean_news(news):
    output_news = []
    filtered_sentence = []
    x = ' '.join(
        re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ',
               news).split())
    stop_words = set(stopwords.words('english'))
    # tokens of words
    word_tokens = word_tokenize(x)
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    output_news.append(" ".join(filtered_sentence))
    # print(analyze_sentiment(output_news[0]))
    return analyze_sentiment(output_news[0])
Example #27
def preprocessing(df):
    corpus = []
    pstem = PorterStemmer()
    stop_words = set(stopwords.words('english'))  # build the stopword set once, not per word
    for i in range(df['text'].shape[0]):
        # Remove unwanted characters
        text = re.sub("[^a-zA-Z]", ' ', df['text'][i])
        # Transform words to lowercase
        text = text.lower()
        text = text.split()
        # Remove stopwords, then stem
        text = [pstem.stem(word) for word in text if word not in stop_words]
        text = ' '.join(text)
        # Append cleaned tweet to corpus
        corpus.append(text)
    print("Corpus created successfully")

    return corpus
Example #28
def stemming(string, top):
    string_rep = re.sub('[^a-zA-Z]', ' ', string)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    #inter_stem=[ps.stem(l) for l in new_vocab]
    vocab_stem = [
        ps.stem(w) for w in new_vocab
        if w not in set(stopwords.words('english'))
    ]
    dictry_stem = set(vocab_stem)
    data = ' '.join(vocab_stem)
    arr = []
    c = 0
    t = 0
    for i in dictry_stem:
        c = vocab_stem.count(i)  # count whole-token occurrences of each stem
        t = t + c
        arr.append([i, c])
        #print(i,c)
    aux = [[q, w] for q, w in arr if w > 1]  # keep stems occurring more than once
    aux.sort(key=lambda i: i[1], reverse=True)

    x = []
    y = []
    for wrd, co in aux:
        x.append(wrd)
        y.append(co)

    print("total number of words:", t, "\t set of words:",
          len(set(dictry_stem)))
    #print("Top ",top," words occurring more than twice: ",aux)
    print("%age of total length these word account for:", (sum(y) / t) * 100)
    #plt.bar(x,y,color='red',alpha=0.8)
    #plt.xlabel('Word')
    #plt.ylabel('Frequency')
    #plt.title('Words occurring more than twice')
    return (x, string_rep)
Example #29
def prepare_document(text):
    tokenizer = RegexpTokenizer(r'\w+')

    raw = text.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens (build the stopword set once)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [i for i in tokens if i not in stop_words]

    # remove numbers
    number_tokens = [re.sub(r'[\d]', ' ', i) for i in filtered_tokens]
    number_tokens = ' '.join(number_tokens).split()

    # stem tokens (reuse a single stemmer instance)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(i) for i in number_tokens]

    # remove empty
    length_tokens = [i for i in stemmed_tokens if len(i) > 1]

    return length_tokens
Example #30
def get_url(urls):
    response = request.urlopen(urls)
    rw = str(response.read().decode('utf-8'))
    soup = BeautifulSoup(rw, "html.parser")
    # Removing script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    meta = soup.find_all('meta')
    rel3 = ''
    for tag in meta:
        if 'name' in tag.attrs.keys() and tag.attrs['name'].strip().lower(
        ) in ['description', 'keywords']:
            rel3 += tag.attrs['content']
            rel3 += '\n'

    # get plain text
    text_soup = soup.get_text()

    # Removing leading and trailing spaces
    lines = (line.strip() for line in text_soup.splitlines())
    # Breaking headlines into a line
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Removing blank lines
    text_chunk = '\n'.join(chunk for chunk in chunks if chunk)
    rel3 += text_chunk

    rel3 = re.sub(r'[^a-zA-Z0-9_\.]', ' ', rel3)

    # Tokenization first, so lemmatization operates on words rather than single characters
    tokens = nltk.word_tokenize(rel3)

    # Lemmatization (verb, noun, then adjective)
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(t, pos='v') for t in tokens]
    tokens = [lem.lemmatize(t, pos='n') for t in tokens]
    tokens = [lem.lemmatize(t, pos='a') for t in tokens]

    return tokens
Example #31
def remove2(t):
    t = str(t)
    t = re.sub(r'\s+', ',', t)  # replace runs of whitespace with commas
    t = t.split(',')
    return t
Example #32
def remove(line):
    line = re.sub(';', ' ', line)
    line = line.split()  # str.split("\\s+") would split on the literal text "\s+"; split on whitespace instead
    return line
Example #33
test_data_labels=y_train1
#print "real labels"
#print y_train1 
#clf1 =   GradientBoostingClassifier(n_estimators = 1000)
#clf2 =   AdaBoostClassifier(n_estimators = 1000)
#clf3 =   RandomForestClassifier()
#print "the test data is"
#print test[1:3]
f = open('/home/mohan/Downloads/Theano-Tutorials-master/crawledcontents95.txt', encoding='utf-8')
raw = f.read()
string = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').decode('ascii')
sent = nl.sent_tokenize(string)
test_sents = []
for i in range(len(sent)):
    test_sents.append(' '.join(nl.word_tokenize(re.sub('''[!@#$(),.;'":/\n?!\W]''',' ', sent[i]))))
###########################################################################################################################################
clf1.fit(train_data2,y_train1)
X_test1 = vctr.transform(test_sents).toarray()
grad_label = clf1.predict(X_test1)
grad_label = grad_label.astype(int)
print "grad_label"
print grad_label
for i in range(len(grad_label)):
    print(test_sents[i], '=>', grad_label[i])
################################################grad prob#######################################
clf1.fit(train_data2,y_train1)
X_test1 = vctr.transform(test_sents).toarray()
grad_label1 = clf1.predict_proba(X_test1)
#grad_label1 = grad_label.astype(int)
print "grad_label with probablistic labels "
Example #34
def reformatWord(word):
    alphaNumericString = re.sub(r'\W+', '', word) # keep only alphanumeric characters
    stemmer = nltk.stem.snowball.EnglishStemmer()
    stemmedString = stemmer.stem(alphaNumericString)
    return stemmedString