Example #1
def get_stopwords(filename):
    stopwords = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip(" \n\t")
            stopwords.append(line)
    return stopwords
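
A minimal usage sketch for the function above (the file name is hypothetical; the file holds one stop word per line):

with open('stopwords_en.txt', 'w') as out:
    out.write("the\nand\nof\n")

print(get_stopwords('stopwords_en.txt'))  # ['the', 'and', 'of']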
Example #2
    def getTermsAndStopList(self, str):
        stopwords = []
        words = []
        tokens = self.tokenizer.tokenize(str)
        for token in tokens:
            if not token.isdigit():
                if token not in self.stop:
                    words.append(token)
                else:
                    stopwords.append(token)
        terms = [self.stemmer.stem(word) for word in words]
        return terms, stopwords
Example #3
    def read(self, list, path):
        #read the file (forcing utf8) and output a list
        lemmer = WordNetLemmatizer()

        #count the words and keep those with a low frequency (fewer than 6 occurrences)
        #so that the algorithm doesn't get distracted by irrelevant non-recurrent words
        data_file = open(path, 'rt', encoding='utf8', errors='replace')
        cn = Counter(word for l in data_file for word in l.split())
        words = dict((word, v) for word, v in cn.items() if v < 6)
        words_list = words.keys()
        data_file.close()

        #read the stopwords file and create a list with them
        stopwords_f = open('stopwords.txt',
                           'rt',
                           encoding='utf8',
                           errors='replace')
        stopwords = []
        for line in stopwords_f:
            stopwords.append(str(line.strip()))
        stopwords_f.close()

        data_file = open(path, 'rt', encoding='utf8', errors='replace')
        for line in data_file:
            #accumulator for the lemmatized version of this line
            new_line = ''

            #manually eliminate punctuation and grammar tokens, according to statistics
            #of the data: least repeated and most repeated words (irrelevant ones)
            line = str(line).replace(",", " ")
            line = str(line).replace(' " ', ' ')
            line = str(line).replace(".", "")
            line = str(line).replace(" ?", "")
            line = str(line).replace(" : ", " ")
            line = str(line).replace(" ; ", " ")
            line = str(line).replace(" ( ", " ")
            line = str(line).replace(" ) ", " ")
            line = str(line).replace(". . .", " ")
            line = str(line).replace(" -- ", " ")

            #remove the least used words (the low-frequency words counted above)
            line = self.replaceMultiple(str(line), words_list, ' ')
            #remove the stopwords read from stopwords.txt
            line = self.replaceMultiple(str(line), stopwords, ' ')

            #lemmatize the final words in the line
            for word in line.split(' '):
                new_line = new_line + ' ' + str(lemmer.lemmatize(word))
            #append the lemmatized line
            list.append(new_line.strip())

        data_file.close()
        #output the list
        return list
import re


def remove_stop_word(tweet):
    # build the stop-word list from a file with one word per line
    stopwords = []
    with open("english") as files:
        for line in files:
            values = line.split()
            word = values[0]
            stopwords.append(word)
    # strip each stop word (matched as a whole word) plus any trailing whitespace
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    tweet = pattern.sub('', tweet)
    return tweet
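
The same whole-word regex idea, sketched standalone with an inline stop-word list (the words here are illustrative, not the file-based list above):

import re

stopwords = ["the", "is", "a"]
pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
print(pattern.sub('', "the movie is a hit"))  # 'movie hit'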
def remove_stopwords(df):
    """ Removes stopwords based on a known set of stopwords
    available in the nltk package. In addition, we include our
    made up word in here.
    """
    # Luckily nltk already has a set of stopwords that we can remove from the texts.
    stopwords = nltk.corpus.stopwords.words('english')
    # we'll add our own special word in here 'qwerty'
    stopwords.append(our_special_word)

    df['stopwords_removed'] = list(map(lambda doc:
                                       [word for word in doc if word not in stopwords],
                                       df['tokenized_text']))
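
A hypothetical usage sketch; it assumes the NLTK stopwords data has been downloaded and supplies the module-level name our_special_word that the function expects:

import nltk
import pandas as pd

our_special_word = 'qwerty'
df = pd.DataFrame({'tokenized_text': [['this', 'is', 'qwerty', 'text']]})
remove_stopwords(df)
print(df['stopwords_removed'].iloc[0])  # ['text']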
def get_stopwords():
    initial_stopwords = [
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "all", "almost", "alone", "along", "already", "also",
        "although", "always", "am", "among", "amongst", "amoungst", "amount",
        "an", "and", "another", "any", "anyhow", "anyone", "anything",
        "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before",
        "beforehand", "behind", "being", "below", "beside", "besides",
        "between", "beyond", "bill", "both", "bottom", "but", "by", "call",
        "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de",
        "describe", "detail", "did", "do", "does", "done", "down", "due",
        "during", "each", "eg", "eight", "either", "eleven", "else",
        "elsewhere", "empty", "enough", "etc", "even", "ever", "every",
        "everyone", "everything", "everywhere", "except", "few", "fifteen",
        "fifty", "fill", "find", "fire", "first", "five", "for", "former",
        "formerly", "forty", "found", "four", "from", "front", "full",
        "further", "get", "give", "go", "had", "has", "hasnt", "have", "he",
        "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
        "hers", "herself", "him", "himself", "his", "how", "however",
        "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest", "into",
        "is", "it", "its", "itself", "keep", "last", "latter", "latterly",
        "least", "less", "ltd", "made", "many", "may", "me", "meanwhile",
        "might", "mill", "mine", "more", "moreover", "most", "mostly", "move",
        "much", "must", "my", "myself", "name", "namely", "neither", "never",
        "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor",
        "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once",
        "one", "only", "onto", "or", "other", "others", "otherwise", "our",
        "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
        "please", "put", "rather", "re", "same", "see", "seem", "seemed",
        "seeming", "seems", "serious", "several", "she", "should", "show",
        "side", "since", "sincere", "six", "sixty", "so", "some", "somehow",
        "someone", "something", "sometime", "sometimes", "somewhere", "still",
        "such", "system", "take", "ten", "than", "that", "the", "their",
        "them", "themselves", "then", "thence", "there", "thereafter",
        "thereby", "therefore", "therein", "thereupon", "these", "they",
        "thick", "thin", "third", "this", "those", "though", "three",
        "through", "throughout", "thru", "thus", "to", "together", "too",
        "top", "toward", "towards", "twelve", "twenty", "two", "un", "under",
        "until", "up", "upon", "us", "very", "via", "was", "we", "well",
        "were", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole",
        "whom", "whose", "why", "will", "with", "within", "without", "would",
        "yet", "you", "your", "yours", "yourself", "yourselves"
    ]
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)
    return stopwords
def load_en_stopwords(filename):
    '''Loads English stop-words from a given file 
    
    Return: 
        a list of stop words
    Arguments: 
        the stop-words file name
    '''
    
    stopwords = []
    with codecs.open(filename, mode='r', encoding='utf-8') as fSW: 
        for line in fSW: 
            stopwords.append(line.strip().lower())
    return stopwords
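
A minimal usage sketch (the file name is hypothetical):

import codecs

with codecs.open('english_stopwords.txt', mode='w', encoding='utf-8') as f:
    f.write(u"The\nAnd\nOf\n")

print(load_en_stopwords('english_stopwords.txt'))  # ['the', 'and', 'of']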
Example #8
def ques23(entiresentence,string,np,x,y):
    stopwords=[]
    querywords = string.split()
    #print("querywords",querywords)
    grammar = "chunk:{<VB.?|MD|RP|RB.?>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res,sentence=findChunk(string,grammar,"chunk")
    #print("res",res,"sentence",sentence)
#     for subtree in sentence.subtrees():
#         if subtree.label() == 'chunk':
#             print(subtree)
#     for pr in np:
#         if()
    if(len(sentence)>0):
        for words in sentence:
            stopwords.append(words[0])
        #print("sp",stopwords)
        stopwords.append(np)
        stopwords.append(x)
        stopwords.append(y)
        stopwords=set(stopwords)
        #print(stopwords)
        resultwords  = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs",resultwords)
        WhatQues="What "+x+" "+np+" "+y+" "+resultwords+"?"
        #print(WhatQues)
        AnsQues[entiresentence].append(WhatQues)
Example #9
def get_hindi_stopwords(filename="stop-words.txt"):
    """Get stopwords in Hindi.

    Args:
    filename - stop-words file, one word per line

    Returns:
    set of Hindi stop words
    """
    stopwords = []
    with open(filename, "r", encoding="utf-8") as stop_words:
        for word in stop_words.readlines():
            stopwords.append(unidecode(word.strip()))
    return set(stopwords)
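
A hypothetical usage sketch; it requires the third-party unidecode package and writes a small UTF-8 stop-words file with one word per line:

from unidecode import unidecode

with open("stop-words.txt", "w", encoding="utf-8") as f:
    f.write("और\nका\nके\n")

print(get_hindi_stopwords())  # a set of ASCII-transliterated Hindi stop words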
Example #10
def preprocessing(tagged_by_Sentence):
    """
    1. 특수문자 제거, 소문자
    2. not, "n't" -> not_stemming(다음단어)
    3. 특수문자 제거, 숫자 제거
    4. stopword 제거
    5. stemming
    """
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    stopwords.remove("not")
    stopwords.remove('very')
    stopwords.append("'m")
    stopwords.append("'s")

    re_special = re.compile('[^A-Za-z0-9]+')  # anything other than letters and digits
    re_num = re.compile('[0-9]+')  # digits
    st = PorterStemmer()
    new_sent = []
    not_indice = []

    for sent in tagged_by_Sentence:
        text = [(tup[0].lower(), tup[1]) for tup in sent
                if not bool(re_special.match(tup[0]))]  # 1. remove special characters, lowercase

        # 2. merge "not" / "n't" with the following word
        # when "not" or "n't" appears, join it with the next word and store that word's
        # index so it can be removed later by del_element_by_indice
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text):
                    if not bool(re_special.match(
                            text[index + 1][0])) or text[index + 1][1] != 'CD':
                        new_text.append("not_" + st.stem(text[index + 1][0]))
                        not_indice.append(index)
                else:
                    new_text.append("not")
            else:
                if not bool(re_num.match(
                        tup[0])) or tup[1] != 'CD':  # 3. remove special characters and digits
                    new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)

        new_words = [
            st.stem(word) for word in new_text if word not in stopwords
        ]  # 4, 5: remove stopwords and stem
        new_sent.append(new_words)
    return new_sent
def tokenize_headlines_with_sentiment(df):
    
    headlines = df.title.tolist()

    all_bigrams = []

    headlines_string = (' '.join(filter(None, headlines))).lower()
    tokens = word_tokenize(headlines_string)

    # Remove single letter tokens
    tokens_sans_singles = [i for i in tokens if len(i) > 1]

    # Remove stop words
    stopwords = nltk.corpus.stopwords.words('english')
    new_words=("s'","'s","election", "2020", "n't", "wo","...", "'")
    for i in new_words:
        stopwords.append(i)

    tokens_sans_stop = [t for t in tokens_sans_singles if t not in stopwords]
    tokens_sans_stop = [t.replace('wins', 'win') for t in tokens_sans_stop]

    # Get bigrams and frequencies
    bi_grams = list(ngrams(tokens_sans_stop, 2)) 
    counter = Counter(bi_grams)

    # Convert counter dictionary to dataframe
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index().rename(columns={"index": "bigram", 0: "freq"})
    counter_df_sort = counter_df.sort_values(by=['freq'],ignore_index=True, ascending=False)


    # Create concatenated bigram string for sentiment scoring
    counter_df_sort[['word1', 'word2']] = pd.DataFrame(counter_df_sort['bigram'].tolist(), index=counter_df_sort.index)
    counter_df_sort['bigram_joined'] = counter_df_sort.word1 + " " + counter_df_sort.word2
    counter_df_sort=counter_df_sort.drop(['word1','word2'], axis=1)

    # get sentiment for bigrams
    analyzer = SentimentIntensityAnalyzer()
    bigrams_scores = counter_df_sort['bigram_joined'].apply(analyzer.polarity_scores).tolist()
    df_bigrams_scores = pd.DataFrame(bigrams_scores).drop(['neg','neu','pos'], axis=1).rename(columns={"compound": "sentiment_compound"})
    bigrams_freq_and_scores = counter_df_sort.join(df_bigrams_scores, rsuffix='_right')

    print(f"There are {len(bigrams_freq_and_scores)} extracted bigrams across all headlines")

    return bigrams_freq_and_scores
Example #12
def stem_tokenize(str_use):
    """
    Takes a string and tokenizes it, stripping it of punctuation and stopwords. Returns a list of strings.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    addstopwords = ["in", "on", "of", "''"]
    stopwords.append(addstopwords)
    stemmer = wordnet.WordNetLemmatizer()
    tokenizer = punkt.PunktWordTokenizer()

    # removes stopwords and punctuation, then splits the string into a list of words
    token = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(str_use)
             if token.lower().strip(string.punctuation) not in stopwords]
    text = [word for word in token if re.search(r'[a-zA-Z]', word) is not None]
    stem = [stemmer.lemmatize(word) for word in text]
    # Returns a list of strings
    return stem
Example #13
def count_words(text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append('https')
    stopwords.append('http')
    stopwords.append('im')
    stopwords.append('# ')
    # RegEx for stopwords
    RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
    # replace '|'-->' ' and drop all stopwords

    return (text.str.lower().replace([r'\|', RE_stopwords], [' ', ''],
                                     regex=True).str.cat(sep=' ').split())
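
A hypothetical usage sketch; it expects a pandas Series of raw text and assumes the NLTK stopwords data is available:

import nltk
import pandas as pd

posts = pd.Series(["Check http example", "I am going home now"])
print(count_words(posts))  # e.g. ['check', 'example', 'going', 'home']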
Example #14
def displaywordcloud(data=None,
                     backgroundcolor='#fff',
                     width=1000,
                     height=1000):
    stopwords.append(product_name)
    wordcloud = WordCloud(
        font_path='C:/Windows/Fonts/NanumGothicCoding-Bold.ttf',
        mask=mask_png,
        stopwords=stopwords,
        collocations=False,
        max_font_size=160,
        colormap='tab10',
        background_color=backgroundcolor,
        width=width,
        height=height).generate(data)
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation="bilinear", aspect='auto')
    plt.axis("off")
    # plt.show
    fig.savefig('webservice/static/wordcloud.png')
Example #15
def Cleaning(liste):
    import re 
    import nltk 
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    ps = PorterStemmer()
    stopwords = list(set(stopwords.words('english')))
    # also treat single letters as stopwords
    Liste = list('abcdefghijklmnopqrstuvwxyz')
    stopwords.extend(Liste)
    corpus = []
    for i in range(0, len(liste), 1):
        review = re.sub('[^a-zA-Z]', ' ', liste[i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stopwords]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
    def is_ci_stem_stopword_set_match(self, a, b, threshold=0.5):
        """Check if a and b are matches."""
        # Get default English stopwords and extend with punctuation
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.extend(string.punctuation)
        stopwords.append('')

        # Create tokenizer and stemmer
        tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
        stemmer = nltk.stem.snowball.SnowballStemmer('english')
        tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
                    if token.lower().strip(string.punctuation) not in stopwords]
        tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
                    if token.lower().strip(string.punctuation) not in stopwords]
        stems_a = [stemmer.stem(token) for token in tokens_a]
        stems_b = [stemmer.stem(token) for token in tokens_b]

        # Calculate Jaccard similarity
        ratio = len(set(stems_a).intersection(stems_b)) / float(
            len(set(stems_a).union(stems_b)))
        return (ratio >= threshold)
Example #17
def nounphrase(tree):
    stopwords=[]
    querywords=[]
    grammar = "verb:{<VBG|VBN|VB.?|MD|RP>+}"
    res,verbtree=findChunkwithPOSTags(cstr,grammar,"verb")
#     print("res",res,"sentence",verbtree)
    #print(len(verbtree))
    if(len(verbtree)>0):
        for subtree in verbtree.subtrees():
            for words in subtree:
                stopwords.append(words[0]) 
        for subtree in tree.subtrees():
                for words in subtree:
                    querywords.append(words[0])
    #             print("querywords",querywords)
        resultwords  = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs",str(resultwords))
        return 1,resultwords
    else:
        return 0,""
    def update_stopwords(self,
                         add_words=[],
                         remove_words=[],
                         update_corpus=True):
        stopwords = self.stopwords
        stopwords.extend(add_words)

        for x in remove_words:
            if x in stopwords:
                stopwords.remove(x)

        self._stopwords_ = stopwords
        if update_corpus:
            self.prepare_corpus()
Example #19
def ques2_2(entiresentence,string,np,x,y):
    np=""
    stopwords=[]
    querywords = string.split()
    grammar =  "chunk:{<IN>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res,sentence=findChunk(string,grammar,"chunk")
    if(res!=0):
        for words in sentence:
            stopwords.append(words[0])
        #print("sp",stopwords)
        stopwords.append(np)
        stopwords.append(x)
    #     stopwords.append(y)
        #print(stopwords)
        ########FIND THE PREPOSTION
        preposition=""
        for words in sentence:
                #print("words in sentence",words)
                if(words[1]=='IN'):
                        #print("prep found",words[0])
                        preposition=words[0]
                        #print("preposition is",preposition)
                        #print("///////////////////////////////////")
        resultwords  = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print(resultwords)
        prepques=preposition+" what "+x+np+" "+resultwords+"?"
        #print(prepques)
        AnsQues[entiresentence].append(prepques)
def getStopWordList(stopWordFile):
    stopwords = []
    stopwords.append("AT_USER")
    stopwords.append("URL")

    with open(stopWordFile, 'r') as f:
        reader = csv.reader(f)
        for w in reader:

            stopwords.append(w[0])

    return stopwords
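
A minimal usage sketch (the CSV file name is hypothetical; one stop word per row):

import csv

with open('stopwords.csv', 'w', newline='') as f:
    csv.writer(f).writerows([['the'], ['and']])

print(getStopWordList('stopwords.csv'))  # ['AT_USER', 'URL', 'the', 'and']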
Example #21
print("Dataset Loaded...\n")

# Tokenization and Stemming
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')


def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]


#Adding new stopwords
stopwords = nltk.corpus.stopwords.words('english')
with open('StopWords.txt', 'r') as file:
    for i in file:
        stopwords.append(i.strip())

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)
X = tfidf.fit_transform(data)
words = tfidf.get_feature_names()
print(words)
print("Number of features: " + str(len(words)))  #check number of features
print("Vectorization Completed...\n")
'''
# Clustering using K-Means
print("k-Means with 5 cluster:\n")
kmeans_1 = KMeans(init='k-means++', n_clusters = 5, random_state = 42)
kmeans_1.fit(X)
common_words = kmeans_1.cluster_centers_.argsort()[:,-1:-21:-1]
for num, centroid in enumerate(common_words):
import requests
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#reads from the file defined in project specs on ecampus
url = "http://www.gutenberg.org/cache/epub/9845/pg9845.txt"
r = requests.get(url, allow_redirects=True)

# alternate:
# raw = "your string here" or read in a txt file

#tokenize to keep only words
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(r.text)

#get stopwords + extra for the era of the book at the url
stopwords = stopwords.words('english')
extrastop = ["thee", "thy", "ye", "thine", "thou"]
stopwords.extend(extrastop)  # extend, so each archaic word is filtered individually
nostop = [w for w in tokens if w not in stopwords]

#remove repeats
norepeat = list(set(nostop))
#add all of the words + their initial location to a dictionary
dictionary = {}
for w in norepeat:
    dictionary[w.upper().encode('utf8')] = r.text.find(w)

#write to file
with open('tokens.txt', 'w') as file:
    file.write(str(dictionary))
Example #23
def FeaturizeFile(df):
    # df = pd.read_csv(CSVfile,encoding = 'latin1')

    stats = []
    attribute_name = []
    sample = []
    id_value = []
    i = 0

    castability = []
    number_extraction = []

    avg_tokens = []
    ratio_dist_val = []
    ratio_nans = []

    keys = list(df.keys())

    attribute_name.extend(keys)
    summary_stat_result = summary_stats(df, keys)
    stats.extend(summary_stat_result)
    samples = get_sample(df, keys)
    sample.extend(samples)

    # castability.extend(castability_feature(df, keys))
    # number_extraction.extend(numeric_extraction(df, keys))

    # avg_tokens.extend(get_avg_tokens(samples))
    ratio_dist_val.extend(get_ratio_dist_val(summary_stat_result))
    ratio_nans.extend(get_ratio_nans(summary_stat_result))

    csv_names = [
        'Attribute_name', 'total_vals', 'num_nans', 'num_of_dist_val', 'mean',
        'std_dev', 'min_val', 'max_val', '%_dist_val', '%_nans', 'sample_1',
        'sample_2', 'sample_3', 'sample_4', 'sample_5'
    ]
    golden_data = pd.DataFrame(columns=csv_names)

    for i in range(len(attribute_name)):
        # print(attribute_name[i])
        val_append = []
        val_append.append(attribute_name[i])
        val_append.extend(stats[i])

        val_append.append(ratio_dist_val[i])
        val_append.append(ratio_nans[i])

        val_append.extend(sample[i])
        #     val_append.append(castability[i])
        #     val_append.append(number_extraction[i])
        #     val_append.append(avg_tokens[i])

        golden_data.loc[i] = val_append
    #     print(golden_data)

    curdf = golden_data

    for row in curdf.itertuples():

        # print(row[11])
        is_list = False
        curlst = [row[11], row[12], row[13], row[14], row[15]]

        delim_cnt, url_cnt, email_cnt, date_cnt = 0, 0, 0, 0
        chars_totals,word_totals,stopwords,whitespaces,delims_count = [],[],[],[],[]

        for value in curlst:
            word_totals.append(len(str(value).split(' ')))
            chars_totals.append(len(str(value)))
            whitespaces.append(str(value).count(' '))

            if del_reg.match(str(value)): delim_cnt += 1
            if url_reg.match(str(value)): url_cnt += 1
            if email_reg.match(str(value)): email_cnt += 1

            delims_count.append(len(delimeters.findall(str(value))))

            tokenized = word_tokenize(str(value))
            # print(tokenized)
            stopwords.append(len([w for w in tokenized if w in stop_words]))

            try:
                _ = pd.Timestamp(value)
                date_cnt += 1
            except ValueError:
                date_cnt += 0

        # print(delim_cnt,url_cnt,email_cnt)
        if delim_cnt > 2: curdf.at[row.Index, 'has_delimiters'] = True
        else: curdf.at[row.Index, 'has_delimiters'] = False

        if url_cnt > 2: curdf.at[row.Index, 'has_url'] = True
        else: curdf.at[row.Index, 'has_url'] = False

        if email_cnt > 2: curdf.at[row.Index, 'has_email'] = True
        else: curdf.at[row.Index, 'has_email'] = False

        if date_cnt > 2: curdf.at[row.Index, 'has_date'] = True
        else: curdf.at[row.Index, 'has_date'] = False

        curdf.at[row.Index, 'mean_word_count'] = np.mean(word_totals)
        curdf.at[row.Index, 'std_dev_word_count'] = np.std(word_totals)

        curdf.at[row.Index, 'mean_stopword_total'] = np.mean(stopwords)
        curdf.at[row.Index, 'stdev_stopword_total'] = np.std(stopwords)

        curdf.at[row.Index, 'mean_char_count'] = np.mean(chars_totals)
        curdf.at[row.Index, 'stdev_char_count'] = np.std(chars_totals)

        curdf.at[row.Index, 'mean_whitespace_count'] = np.mean(whitespaces)
        curdf.at[row.Index, 'stdev_whitespace_count'] = np.std(whitespaces)

        curdf.at[row.Index, 'mean_delim_count'] = np.mean(delims_count)
        curdf.at[row.Index, 'stdev_delim_count'] = np.std(delims_count)

        if curdf.at[row.Index,
                    'has_delimiters'] and curdf.at[row.Index,
                                                   'mean_char_count'] < 100:
            curdf.at[row.Index, 'is_list'] = True
        else:
            curdf.at[row.Index, 'is_list'] = False

        if curdf.at[row.Index, 'mean_word_count'] > 10:
            curdf.at[row.Index, 'is_long_sentence'] = True
        else:
            curdf.at[row.Index, 'is_long_sentence'] = False

        # print(np.mean(stopwords))

        # print('\n\n\n')

    golden_data = curdf

    return golden_data
Example #24
    for word in text:
        if word in stopwords_set:
            continue
        if words.get(word) is None:
            words[word] = 1
        else:
            words[word] = words[word] + 1

word_lis = []
for word, no in words.items():
    word_lis.append([word, no])
word_lis = pd.DataFrame(word_lis, columns=["word", "freq"])
word_lis = word_lis.sort_values(by="freq", ascending=False)

for word in word_lis['word'][:20]:
    stopwords.append(word)
# print stopwords

# In[5]:

## Remove stop words


def s_wor_rm(text):
    words = []
    text = text.split()
    for word in text:
        if word in stopwords:
            continue

        words.append(word)
Example #25
def http(text):
	# collect URL-like tokens ('http' also matches 'https'), then drop them
	stopwords = [w for w in text if 'http' in w]
	mynewtext = [w for w in text if w not in stopwords]
	return mynewtext
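
A usage sketch: drop URL-like tokens from an already tokenized tweet (the tokens are made up):

print(http(['check', 'this', 'https://t.co/abc', 'out']))  # ['check', 'this', 'out']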
Example #26
def remove_hashtag(sentence):
	base = tokinizer(sentence)
	# collect hashtag tokens, then drop them from the token list
	stopwords = [w for w in base if w.startswith('#')]
	mynewtext = [w for w in base if w not in stopwords]
	return WordList_to_sentence(mynewtext)
Example #27
            dict[subject] = count

    return dict


if __name__ == "__main__":
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    ## get stop words and normalize
    initial_stopwords = stopwords()
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)

    print("Loading index")
    mention_dict = load_index("../data/surface_forms_new.txt")
    subject_predicates_dict = [
    ]  # load_subject_predicates("data/SimpleQuestions_v2/freebase-FB2M.txt")
    subject_triple_counts = [
    ]  #load_subject_triple_counts("data/subject_triple_counts.txt")

    dataset_names = ["test"]
    max_ngram_size = 10
    exclude_small_ngrams = True
    exclude_stop_words = True

    for d in dataset_names:
        correct_count = 0
Example #28
from ML_models import train_svm,train_knn,knn_accuracies,train_log_regression,\
    log_regression_stats,svm_stats,train_random_forest
from dataset_builder import add_sub_pol_to_dataset

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords

###################### Settings ######################
root_path = os.path.dirname(os.path.realpath(__file__))
ds_path = os.path.join(root_path, '..', 'common', 'dataset.csv')
clean_ds_path = os.path.join(root_path, '..', 'common', 'clean_dataset.csv')
stopwords = stopwords.words('french')
stopwords.append('a')
stopwords.append('e')
stopwords.append('Tre')
stopwords.append('cest')
amazon_img_path = 'https://i1.wp.com/www.joptimisemonsite.fr/wp-content/uploads/2015/02/logo-amazon.jpg?fit=810%2C295&ssl=1&is-pending-load=1'
#######################################################


###################### Utils ######################
@st.cache
def load_data():
    ds = pd.read_csv(ds_path)
    clean_ds = pd.read_csv(clean_ds_path)
    clean_ds = add_sub_pol_to_dataset(clean_ds)
    clean_ds = clean_ds.drop(clean_ds[clean_ds.Subjectivity > 1.0].index)
    return ds, clean_ds
Example #29
def cluster_topics(json_data):
    import json
    import time
    import nltk
    import re
    import numpy as np
    import scipy as sp
    import math
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import normalize


    def tokenize_and_stem(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token) and token not in stopwords:
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems


    def tokenize_only(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens :
            if re.search("[a-zA-Z]", token) and token not in stopwords:
                filtered_tokens.append(token)
        return filtered_tokens

    #data = json.loads(open("room_messages.json").read())
    data = json.loads(open(json_data).read())
    stopwords = stopwords.words("english")
    stopwords.append('blah')
    stemmer = SnowballStemmer("english")

    # Extract messages
    messages = []
    time_stamps = []
    for i in range(len(data["items"])):
        if "text" in data["items"][i]:
            message = (data["items"][i]["text"])
            time_stamp = data["items"][i]["created"]
            t = float(re.search("[0-9]*:[0-9]*", time_stamp).group(0).replace(":", "."))
            messages.append(message)
            time_stamps.append(t)

    # Extract stemmed and tokenized vocab
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in messages:
        allwords_stemmed = tokenize_and_stem(i)  # for each item in 'synopses', tokenize/stem
        totalvocab_stemmed.extend(allwords_stemmed)  # extend the 'totalvocab_stemmed' list

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)

    # vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    # print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

    # Cluster messages according to topic using K-means
    # define vectorizer parameters
    if len(allwords_stemmed) > 200:
        tfidf_vectorizer = TfidfVectorizer(max_df=0.99, max_features=200000,
                                        min_df=0.01, stop_words='english',
                                        use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    else:
        tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                           min_df=0.0, stop_words='english',
                                           use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))

    # fit the vectorizer to messages
    # t1 = time.clock()
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    # t2 = time.clock()
    # print("Tf-Idf fit time: ", t2-t1)
    # print(tfidf_matrix.shape)
    terms = tfidf_vectorizer.get_feature_names()

    # insert additional time feature
    new_column = np.asarray(time_stamps).reshape(-1, 1)
    time_norm = normalize(new_column)
    # print(new_column.shape)
    final = sp.sparse.hstack((tfidf_matrix, new_column))
    # print(final.shape)
    terms.append('time')

    # calculate distance between messages using cosine similarity of tf-idf
    dist = 1 - cosine_similarity(final)

    # K-means clustering
    # num_clusters = 5
    if len(messages) > 10:
        num_clusters = math.floor(math.sqrt(len(messages)) / 2)
    else:
        num_clusters = 1

    km = KMeans(n_clusters=num_clusters)
    t3 = time.perf_counter()
    km.fit(final)
    t4 = time.perf_counter()
    #print("K-means fit time: ", t4 - t3)
    clusters = km.labels_.tolist()

    topics = {}
    for t in range(num_clusters):
        t_name = "topic" + str(t)
        topics[t_name] = {}
        t_messages = []
        for i in range(len(messages)):
            if clusters[i] == t:
                t_messages.append(messages[i])
        topics[t_name]['messages'] = t_messages

        #Export as json
        # with open('topics.json', 'w') as outfile:
        #json.dump(topics, outfile)


    # Inspect clusters
    # sorted_messages = {'message': messages, 'cluster': clusters}
    #
    # frame = pd.DataFrame(sorted_messages, index=[clusters], columns=['message', 'cluster'])
    # print(frame['cluster'].value_counts())
    #
    # # top words per cluster
    # print("Top terms per cluster:")
    # print()
    # # sort cluster centers by proximity to centroid
    # order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    #
    # for i in range(num_clusters):
    #     print("Cluster %d words:" % i, end='')
    #
    #     for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
    #         print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    #     print()  # add whitespace
    #     print()  # add whitespace
    #
    #     print("Cluster %d messages:" % i, end='')
    #     for message in frame.ix[i]['message'].values.tolist():
    #         print(' %s,' % message, end='')
    #     print()  # add whitespace
    #     print()  # add whitespace

    return topics
Example #30
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.csr import csr_matrix
import re, pdb, os, json
import numpy as np
from collections import Counter
import nltk

np.random.seed(1234)

lemmatizer = WordNetLemmatizer()
pattern_train = r"(\d*),\"(.*)\",(\d*)"
pattern_test = r"(\d*),\"(.*)\""
stopwords = list(stopwords.words("english"))
stopwords.append("__EOS__")

sw = [
    "right",
    "love",
    "people",
    "feel",
    "yeah",
    "one",
    "see",
    "something",
    "want",
    "year",
    "yes",
    "still",
    "kind",
Example #31
from nltk import bigrams, trigrams
import math
import json
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
stopwords = nltk.corpus.stopwords.words('english')
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)
#st = LancasterStemmer()
wnl = WordNetLemmatizer()
keywords=[]
with open('keywords.txt','r') as f:
    for i in f:
        keywords.append(i.strip())    

with open('stopwords.txt','r') as f:
    for i in f:
        stopwords.append(i.strip())          
def freq(word, doc):
    return doc.count(word)
 
 
def word_count(doc):
    return len(doc)
 
 
def tf(word, doc):
    return (freq(word, doc) / float(word_count(doc)))
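
A small worked example for the term-frequency helpers above (the sample document is made up):

doc = "the cat sat on the mat".split()
print(freq("the", doc))          # 2
print(word_count(doc))           # 6
print(round(tf("the", doc), 3))  # 0.333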
    
 
def calcu_tf(keyword):
    url = "http://en.wikipedia.com/wiki/"+keyword
    content = urllib2.urlopen(url).read()
def get_ngram(filename = None,_type=None,is_stopword=None):
    file_content = open(filename).read()
    # Get the tokens. Tokenize on word punctuation as well as spaces
    tokens = nltk.word_tokenize(file_content)
    text = nltk.Text(tokens)
    word_filter = lambda *w: word_to_find not in w
    ## Bigrams
    # ENABLE STOP WORDS
    #stopwords = stopwords.words('english')
    stopwords = []
    stopwords.append('.')
    #stopwords.append('The')
    stopwords.append(':')
    stopwords.append(',')
    stopwords.append(';')
    stopwords.append('`')
    stopwords.append('``')
    stopwords.append('\"')
    #print stopwords

    if is_stopword == 1:
        filtered_tokens = []
        for ftoken in tokens:
            ftoken_low = ftoken.lower()
            if ftoken not in stopwords:
                #print "Removing ######### " + ftoken
                filtered_tokens.append(ftoken)
    else:
         filtered_tokens = tokens


    if _type == 2:
        finder = BigramCollocationFinder.from_words(filtered_tokens,window_size=3)
    else:
        finder = TrigramCollocationFinder.from_words(filtered_tokens,window_size=3)
    # keep only n-grams that appear at least once
    finder.apply_freq_filter(1)
    lst = list(finder.ngram_fd.items())
    #print lst
    ll = sorted(lst, key=lambda x: x[1])
    #ll = lst.sort(key=lambda x: x[0])
    #print ll
    res =[]
    for i in ll:
        k1 = []
        k  = list(i)
        k1.append(' '.join(k[0]))
        k1.append(k[1])
        res.append(k1)
        #raw_input()
        #print k1
    #print res
    return res
Example #33
		rows = cursor.fetchall()
	except:
		print("Pid : ", pid, " not found")
		return
	return rows
#print nltk.pos_tag(['flipkart','samsung'])

#lemmatiser and stopwords initialization
lemmatizer = nltk.WordNetLemmatizer()
from nltk.corpus import stopwords

#building the stopwords list
stopwords = stopwords.words('english')
stoplist = ['>','<','%','.','br/','(',')','=','!']
for i in stoplist:
	stopwords.append(i)

#normalise each qualified word
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = lemmatizer.lemmatize(word)
    return word
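
A quick usage sketch for normalise (it assumes the NLTK WordNet data is installed):

print(normalise("Batteries"))  # 'battery'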

def armAssoc(dust,strength):
	print()

def writetofile(l):
	feat=[]
	remove=["flipkart","problem","time","product","awesome","thing","port","delivery","buying","perfect","mode","reason","anything","point","excellent","hand","till","fact","market","weather","brand","life","option","guide","money"]
	for i in range(0,len(l)):		
Example #34
import bs4 as bs
import json
import re
import nltk
import heapq
import pickle
import sys
from pprint import pprint
import os
import codecs
import string
from sklearn_crfsuite import metrics
from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels, sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags
#Swapped hardcoded link with system arguement
file_inLoc = sys.argv[1]
file_outLoc = sys.argv[1].split(".")[0] + "-DKE.txt"
file_outLoc = file_outLoc[15:]
with open(file_inLoc, 'r', encoding='utf-8-sig') as f:
    article_text = json.load(f)
pprint(article_text)
'''
scraped_data = urllib.request.Request(file_inLoc, headers={'User-Agent' : "Magic Browser"})
scraped_data=urllib.request.urlopen(scraped_data)
parsed_article = bs.BeautifulSoup(article,'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
Example #35
    words = tknzr.tokenize(words)
    exclude = set(string.punctuation)
    words2 = [word for word in words if
            not word in exclude]
    words_tag = dict(pos_tag(words))
    words = [word.lower() for word in words2 if
            not word in nltk.corpus.stopwords.words('english') and not word.isdigit()]
    # print(words)
    words = [lima(word, words) for word in words]
    # print(words)
    words = ' '.join(words)
    # print(words)
    return words

stopwords = stopwords.words('english')
stopwords.append('.')
# stopwords.union('sally')
# operators = set(('sally'))
# stop = set(nltk.corpus.stopwords.words('english')) + operators
# print(stopwords)
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
chapters=[]
with open('user_posts_1641812829207516.csv') as File:
    tfidfReader = csv.reader(File)
    for row in tfidfReader:
        chapters.append(clean(row[0]).encode('utf-8'))
num_chapters = len(chapters)
fvs_lexical = np.zeros((len(chapters), 3), np.float64)
fvs_punct = np.zeros((len(chapters), 3), np.float64)
i=1
def analyze(soup_object, csv_name):

    #select only the headlines in each google search result
    base = soup_object.select("div.g.card h3")

    #declare empty list where I'm going to put all the headlines
    headlines = []

    #loop to get rid of all html and keep only headline text
    for row in base:
        clean = row.text
        headlines.append(clean)

    #print to verify headlines are clean
    #print(headlines)

    #empty list to store tokenized headlines
    tokens = []

    #loop to tokenize headlines by using clean_tokens method from earlier
    for each in headlines:
        clean = clean_tokens(each)
        tokens.append(clean)

    #print tokens to verify
    #print('tokens =',tokens)

    #create stopwords list from nltk and add fallout 76 as stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append("fallout")
    stopwords.append('76')
    stopwords.append("'fallout")

    #remove stopwords from tokens
    filtered = []
    for token_list in tokens:
        x = []
        for word in token_list:
            if word not in stopwords:
                x.append(word)
        filtered.append(x)

        #print to verify stopwords are gone
    #print("filtered = ", filtered)

    #declare empty list so I can put the tokens back into headlines without stopwords
    combined = []

    #put tokens back into headlines without stopwords
    for token_list in filtered:
        combined.append(" ".join(token_list))

    #import sentiment analyzer
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    #create sia object I think and then create empty list to put end results in
    sia = SIA()
    results = []

    #analyze sentiment of each headline
    for line in combined:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)

    #print to verify
    #for i in results:
        #print(i)


    #write sentiment analysis to csv file
    with open('resultsfinal.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        for d in results:
            writer.writerow(['compound', d['compound']])
Example #37
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords

tweets = pd.read_csv('../data/primary_debates_cleaned.csv')
tweets = tweets.drop(['URL','Location','Date','Line'], axis=1)
tweets = tweets.loc[tweets['Speaker'].isin(['Bush', 'Carson', 'Chafee', 'Christie', 'Clinton', 'Cruz', 'Fiorina', 'Gilmore', 'Graham', 'Huckabee', 'Jindal', 'Kasich', "O'Malley", 'Pataki', 'Paul', 'Perry', 'Rubio', 'Sanders', 'Santorum', 'Trump', 'Walker', 'Webb'])]
tweets = tweets.loc[~tweets['Text'].isin(['(APPLAUSE)', '(ANTHEM)', '(BELL)', '(BOOING)', '(COMMERCIAL)', '(CROSSTALK)', '(LAUGHTER)', '(MOMENT.OF.SILENCE)', '(SPANISH)', '(VIDEO.END)', '(VIDEO.START)', '(inaudible)'])]
#print(tweets)
#print(tweets.Tweet[0])

democrat = tweets[tweets.Party == 'Democratic']

republican = tweets[tweets.Party == 'Republican']

stopwords = stopwords.words('english')
#add some unnecessary word to stopwords list
stopwords.append("rt")
stopwords.append("u")
stopwords.append("amp")
stopwords.append("w")
stopwords.append("th")

clean_democrat = []
for d in democrat.Text:
    d = re.sub(r'https\S+', '', d)
    d = re.sub("[^a-zA-Z]", " ", d)
    d = d.lower()
    d = nltk.word_tokenize(d)
    d = [word for word in d if not word in set(stopwords)]
    lemma = nltk.WordNetLemmatizer()
    d = [lemma.lemmatize(word) for word in d]
    d = " ".join(d)
Example #38
    def pullAllTheStops(self):
        stopwords = []
        for word in self.words:
            if word not in self.stops:
                stopwords.append(word)
        return stopwords
Example #39
import csv
import random
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words("english")
stopwords.append('.')
stopwords.append('?')
stopwords.append('\'')
stopwords.append(',')
stopwords.append('’')
stopwords.append(')')
stopwords.append('(')
stopwords.append('/')

class DataLoader:
    """ DataLoader Utility """
    def __init__(self, file):
        self.file = file
        self.documents = []
        self.all_words = []
        self.questions = []
        self.answers = []
        self.questions_and_answer = []

    # doc - document words and dictionary - corpus words
    def get_feature(self, doc, dictionary):
        vector = {}
        words = set(doc)
        for w in dictionary:
Example #40
    def perturb(self, stringseq, method=0):
        assert len(stringseq) > 0, "sentence must be more than one char!"
        assert any(
            c.lower() in string.ascii_lowercase for c in stringseq
        ), "the sentence has to contain at least one alphabet letter!"
        tokens = nltk.wordpunct_tokenize(stringseq)
        # print (tokens)
        #if method is not specified, we randomly select one.
        if method == 0:
            method = randint(1, 4)
        elif method < 0:
            method = randint(1, 4 + method)  # eliminate certain methods

        # qwerty
        if method == 1:
            while True:
                randidx = randint(0, len(tokens) - 1)
                token = tokens[randidx]
                #has to be at least one alpha letter
                if not any(c.lower() in string.ascii_lowercase for c in token):
                    continue
                res = self.pert_qwerty.perturb(token)
                if res[0] == res[1].lower():
                    continue
                fr, to = self.spans(tokens, stringseq, randidx)
                return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                        (fr + res[2][0], res[2][1]), 0)
        # drop
        elif method == 2:

            stopwords = []
            for i, token in enumerate(tokens):
                if token in self.stopwords:
                    stopwords.append(i)

            if len(stopwords) > 0:
                randidx = choice(stopwords)
                token = tokens[randidx]
            else:
                randidx = randint(0, len(tokens) - 2)
                token = tokens[randidx]

            fr, to = self.spans(tokens, stringseq, randidx)
            # remove a whitespace if drop the word
            if to < len(stringseq) and stringseq[to] == ' ':
                # trailing whitespace
                return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx],
                        (fr, 0), 2)
            elif stringseq[fr - 1] == ' ':
                # leading whitespace
                return (stringseq[:fr - 1] + stringseq[to + 1:],
                        tokens[randidx], (fr - 1, 0), 1)
            else:
                return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx],
                        (fr, 0), 0)
        # delete chars
        elif method == 3:
            pos_tags = nltk.pos_tag(tokens)
            cnt = 0
            while True:
                cnt += 1
                if cnt > 5:
                    break
                randidx = randint(0, len(tokens) - 2)
                token = tokens[randidx]
                if not any(c.lower() in string.ascii_lowercase for c in token):
                    continue
                if len(token) < 2 or pos_tags[randidx] in ["TO", "SYM"]:
                    continue
                res = self.pert_delete.perturb(token, pos_tags[randidx])
                fr, to = self.spans(tokens, stringseq, randidx)
                return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                        (fr + res[2][0], res[2][1]), 0)
            return self.perturb(stringseq, -2)
        # synonym
        elif method == 4:
            cnt = 0
            while True:
                cnt += 1
                if cnt > 5:
                    # in case of dead loop
                    break
                randidx = randint(0, len(tokens) - 2)
                token = tokens[randidx]
                # has to be a word
                if not all(c.lower() in string.ascii_lowercase for c in token):
                    continue
                res = self.pert_synonym.perturb(token)
                if res[0] == res[1]:
                    continue
                fr, to = self.spans(tokens, stringseq, randidx)
                return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                        (fr, res[2][1]), 0)
            return self.perturb(stringseq, -1)
Example #41
        dic.append(
            {'구분': 'trigram', '순위': '1~20', '단어': unicode(trigram[0]) + unicode(trigram[1]) + unicode(trigram[2]),
             '횟수': str(dic[19]['횟수']) + u"회 이상"})

    csv_columns = ['구분', '순위', '단어', '횟수']

    csv_file = filename.split('.')[0] + "freq.csv"

    WriteDictToCSV(csv_file, csv_columns, dic)

stopwords = []
stoptext = open("stopwords.txt", "r")
for sw in stoptext.readlines():
    sw = sw.decode('cp949')
    sw = re.sub('\\n', '', sw)
    stopwords.append(sw)


measures = collocations.BigramAssocMeasures()
measures2 = collocations.TrigramAssocMeasures()



def remove_values_from_list(the_list, val):
   return [value for value in the_list if value != val]
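
A quick usage sketch for the helper above:

print(remove_values_from_list(['a', '', 'b', ''], ''))  # ['a', 'b']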

pwd = os.getcwd()+'\\document'


for path, dirs, files in os.walk(pwd):
    for file in files:
Example #42
# coding=utf-8
import math
from textblob import TextBlob as tb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


#STOPWORDS
stopwords = stopwords.words('portuguese')
stopwords.append('pra')


def remove_stopwords (sentences):
    phrases = []
    for sentence in sentences:
        # generate tokens for this sentence
        tokens = []
        words = word_tokenize(sentence)
        for word in words:
            if word.lower() not in stopwords:
                tokens.append(word)
        phrases.append(' '.join(tokens))
    return phrases
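
A usage sketch; it assumes the NLTK 'punkt' tokenizer and the Portuguese stopwords data have been downloaded:

print(remove_stopwords(["Eu gosto de programar pra valer"]))
# e.g. ['gosto programar valer']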

def get_cosine(vec1, vec2):
    size = len(vec1) - 1
    numerator = sum([vec1[x] * vec2[x] for x in range(size)])

    sum1 = sum([vec1[x]**2 for x in range(size)])
    sum2 = sum([vec2[x]**2 for x in range(size)])
Example #43
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')

consumerKey="xxxx"
consumerSecret="xxxx"
accessToken="xxxx-xxxx"
accessSecret="xxxxx"

stopwords = []
for i in open("stopwords.txt"):
    sword = i.rstrip('\n')
    stopwords.append(sword)

global check
check = []

class listener(StreamListener):
    x = 0
    y = 0
    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        tweet = re.sub(r"http\S+", "", tweet)
        analysis = TextBlob(tweet)
        stop_words = set(stopwords)
        filtered_words = set(analysis.words.lower()) - stop_words # remove stop words from tweet
        for i in set(filtered_words): # remove @ tags