Example #1
def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist
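A minimal usage sketch for porter_list1 (the word list is made up for illustration; the snippet assumes PorterStemmer has been imported from nltk.stem):

from nltk.stem import PorterStemmer  # import assumed by the snippet above

print(porter_list1(["running", "flies", "easily"]))
# -> roughly ['run', 'fli', 'easili']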
def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient, split it into words, stem each word, and construct a new recipe from those words.
    :param inputfilename: path of the input recipe file
    :param outputfilename: path of the output file the stemmed recipes are written to
    :return:
    '''


    with open(outputfilename, 'w') as ff:
        ff.write('[\n')

    with open(inputfilename) as f:
        d = eval(f.read())

    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))
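A hedged usage sketch for splitAndStem, assuming json, re, word_tokenize and PorterStemmer are imported and that the input file contains a list of recipe dicts with 'cuisine', 'id' and 'ingredients' keys; the file names are made up:

# splitAndStem('train_recipes.json', 'train_recipes_stemmed.json')
# Note: the output file starts with '[' and holds one JSON object per line, each followed by a
# trailing comma, so it is not valid JSON until the last comma is removed and a closing ']' is added.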
def parseReviews(mypath):
  filelist = os.listdir(mypath) 
  wordDict = {}
  negationList = ["no","not","never","can't","won't","cannot","didn't","couldn't"]
  negationFlag = False
  stopwordList = set(stopwords.words("english"))
  stemmer = PorterStemmer()
  for file in filelist:
    with open(mypath + "/" + file,"r") as f:
      word_list = word_tokenize(f.read())
    for word in word_list:
      if word in negationList:
        #double negative
        if negationFlag:
          negationFlag = False
        else:
          negationFlag = True
        continue
      if not word.isalnum():
        negationFlag = False
      if word.isalnum() and word not in stopwordList:
        word = stemmer.stem(word)
        if negationFlag:
          word = "!" + word
          negationFlag = False
        if word not in wordDict:
          wordDict[word] = 1
        else:
          wordDict[word] += 1
  return wordDict
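A hedged usage sketch for parseReviews; the directory path is hypothetical, and the function assumes os, word_tokenize, stopwords and PorterStemmer are already imported:

# counts = parseReviews("data/reviews")   # made-up folder of plain-text review files
# print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10])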
def tokenizeTags(str,dict_items):
    #temp map (for getting the local term frequency)
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='' and c not in dict_items:
                    try:
                        if int(c):
                            if len(c)!=4 and (c>2015 or c<1900): #keep years
                                c=stemmer.stem('NUM')
                    except Exception:
                        c = stemmer.stem(c.lower())
                        pass
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    #bigram generation
                #index= len(last)
                #if index>1:
                   # bigram = last[index-2]+' '+last[index-1]
                   # bigram_list.append(bigram)
    return last
def tokenize2_bigram(str,df_freq):
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    last =[]
    bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='':
                    try:
                        if int(c):
                            if len(c)!=4 and (c>2015 or c<1900): #keep years
                                c=stemmer.stem('NUM')
                    except Exception:
                        c = stemmer.stem(c.lower())
                        pass
                    
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    
                    #bigram generation
                index = len(last)
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
Example #6
    def _stemmatize(self, word):
        lmtzr = WordNetLemmatizer() # lemmatizer won't stem words ending in '-ing' unless you tell it it's a verb
        stemmer = PorterStemmer()

        if word.endswith('ing'):
            return stemmer.stem(word)
        return lmtzr.lemmatize(word)
Example #7
def prepare_data(reviews):
    # run porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'],
                           'text': stemmer.stem(x['text'])}

    # clean text and remove empty items
    reviews = filter(lambda x: x != {}, reviews)
    reviews = map(stem_text, reviews)

    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')

    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove stopwords
    reviews = map(remove_stop_words, reviews)

    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove undesired patterns
    reviews = map(clean_text, reviews)

    print('elementos inuteis: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    return reviews
def deleting_stop_words_and_punctuating(text):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    word_tokenize_text = word_tokenize(text)
    words = [ps.stem(lemmatizer.lemmatize(w)) for w in word_tokenize_text]
    return [w.lower() for w in words if not (w in stop_words or w in string.punctuation or w in "''" or w in '``' or w in "the" or w in 'in' or w in "'s")]
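A rough usage sketch, assuming the NLTK punkt, stopwords and wordnet data are available; the sentence and the expected result are illustrative only:

# deleting_stop_words_and_punctuating("The cats were running quickly in the garden.")
# -> roughly ['cat', 'run', 'quickli', 'garden']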
Example #9
def preprocess(text):
  stemmer = PorterStemmer()
  stop = stopwords.words('english')
  tokens = [tok for tok in word_tokenize(text.lower())
    if tok not in stop]
  tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
  return tokens_stemmed    
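A small usage sketch, assuming the NLTK punkt and stopwords data are available:

# preprocess("Cats are running in the garden")
# -> roughly ['cat', 'run', 'garden']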
Example #10
File: sentence.py  Project: Rostlab/relna
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates stemmed Bag of Words representation for each sentence that contains
    an edge, using the function given in the argument.

    By default it uses the Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(
                            token.word
                    ) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(
                            token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
Example #11
File: utils.py  Project: Muugii-bs/hommie
def preprocessing(text, debug = False):
    if debug:
        print text

    # lower case
    text = text.lower()
    if debug:
        print text

    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text

    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words

    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words

    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed

    return words, words_stemmed
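A hedged Python 2 usage sketch; it assumes the local replacers module and the NLTK punkt/stopwords data are available, and the input sentence is made up:

# words, words_stemmed = preprocessing("I can't believe it's raining again!", debug=False)
# print words_stemmed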
Example #12
def stem(string):
    """Stem a phrase"""
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    #words = string.split()
    #for i in range(len(words)):
    #    words[i] = self.stemmer.stem(words[i])
    # stemming last word only
    #string = self._reGlue(words)
    #
    #string2 = stemmer.stem(string)
    #if string2 not in stemdict:
    #    stemdict[string2] = string
    # FIX ME
    if string not in stemdict:
        if bad_unicode(string):
            ## added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
        if not temp:
            pass
        elif temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp
Example #13
    def extract_clean_sentences(self):
        """
        Extracts sentences from plain text. Also applies the following cleaning
        operations:
        - Exclude all characters not recognized by 'utf-8' encoding
        - Exclude all characters not contained in [a-zA-Z0-9 '-]
        - Exclude common stopwords
        """

        text = self.raw_text
        
        exclude = re.compile('[^a-zA-Z0-9 \'-]')
        linebreaks = re.compile('\s')
        excess_space = re.compile('\s+')
        stemmer = PorterStemmer()

        sentences = sent_tokenize(text)
        out = []
        for sentence in sentences:
            sentence = linebreaks.sub(' ', sentence)
            sentence = exclude.sub(' ', sentence)
            sentence = excess_space.sub(' ', sentence)
            tokens = word_tokenize(sentence)
            tokens = [stemmer.stem(t.lower()) for t in tokens]
            out.append(tokens)

        return out
def stemming():
    ps = PorterStemmer()
    input_tweet = 'testing tests trying tries'
    words = word_tokenize(input_tweet)

    for w in words:
        print(ps.stem(w))
Example #15
def openfile(filename,output):
    print(filename)
    #starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename,"r")
    tokens = []

    #Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)

    start2 = timeit.default_timer()
    #splits the lines into words and removes the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table)  )
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
        
    #creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))

    start6 = timeit.default_timer()
    #if a word is not a stop word it adds it to a list 
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)

    startw = timeit.default_timer()    
    #stems each word and adds it to the output file in csv form
    f = open(output,'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                #removes the I number W
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
            
        
    #ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: "+output)
    print(stop - start)
Example #16
File: RetKNN_MPRC.py  Project: w2wei/XPRC
 def buildVocab(self):
     '''Build a vocabulary for the selected documents (from dir database).'''
     ## Note: The source of the text should be Lucene-processed field values. Lucene tokenizes the text, removes stop words, and may take other, unknown steps.
     ## Right now the vocabulary is built on the raw text with NLTK-based stopword removal and tokenization. This should be improved.
     # collect contents from /database/ for each of these doc
     for pmid in self.pmidList: # self.pmidList includes the query and the 99 most similar articles selected by BM25
         self.corpus.append(file(os.path.join(self.dbDir,pmid)).read()) # corpus contains raw text (MH, title*2, abstract)
     for text in self.corpus:
         sent_tokenize_list = sent_tokenize(text.strip().lower(), "english") # tokenize an article text
         stemmed_text = []
         if sent_tokenize_list: # if sent_tokenize_list is not empty
             porter_stemmer = PorterStemmer()
             for sent in sent_tokenize_list:
                 words = TreebankWordTokenizer().tokenize(sent) # tokenize the sentence
                 words = [word.strip(string.punctuation) for word in words]
                 words = [word for word in words if not word in stopwords.words("english")]               
                 words = [word for word in words if len(word)>1] # remove single letters and non alphabetic characters               
                 words = [word for word in words if re.search('[a-zA-Z]',word)]                        
                 words = [porter_stemmer.stem(word) for word in words] # apply Porter stemmer                     
                 stemmed_text.append(" ".join(words))
                 self.vocab+=words
         self.stemmed_corpus.append(". ".join(stemmed_text)) # append a stemmed article text
     # save stemmed corpus
     pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir,str(self.pmidList[0])),"w"))
     # remove low frequency tokens and redundant tokens
     tokenDist = Counter(self.vocab)
     lowFreqList = []
     for token, count in tokenDist.iteritems():
         if count<2:
             lowFreqList.append(token)
     self.vocab = list(set(self.vocab)-set(lowFreqList))
     # save vocabulary
     pickle.dump(self.vocab,file(os.path.join(self.vocabDir,str(self.pmidList[0])),"w"))
Example #17
def parseTranscript(transcript):

    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))

    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)

    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0

    for row in text:

        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))

        try:
            pos += (i + 1)
        except:
            pass

    return index
Example #18
File: okreader.py  Project: ned2/okdata
def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())

    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)
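A brief usage sketch for get_english_vocab, assuming the NLTK 'words' corpus has been downloaded:

# vocab = get_english_vocab(lemmatize=True)
# print(len(vocab), 'run' in vocab)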
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example #20
def stemText(text):

    ps = PorterStemmer()
    words = word_tokenize(text)
    all_words = []
    for w in words:
        all_words.append(ps.stem(w))
    return all_words
Example #21
def preprocess_document(doc):
  stopset = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  tokens = wordpunct_tokenize(doc)
  clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
  final = [stemmer.stem(word) for word in clean]
  return final
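A short usage sketch for preprocess_document, assuming wordpunct_tokenize is imported and the NLTK stopwords data is available; the sentence and result are illustrative:

# preprocess_document("The Quick Brown Foxes are jumping over lazy dogs!")
# -> roughly ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']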
def tokenize2(str,df_freq):
    #temp map (for getting the local term frequency)
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    
    
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            #regular expression -> strip punctuations
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  #keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass

                last.append(c)
                updateDF(temp_map, df_freq, c)
Example #23
def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        if pos and ss.pos is not pos:
            continue

        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary+= ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary+= ss.lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            lesk_dictionary+= list(chain(*[i.lemma_names for i in ss.hypernyms()+ss.hyponyms()]))

        if stem == True: # Matching exact words causes sparsity, so lets match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]

        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
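A hedged usage sketch for new_lesk. It relies on an older NLTK WordNet API in which Synset.pos, Synset.definition and lemma_names are attributes (they are methods in recent NLTK releases), and it assumes 'from nltk.corpus import wordnet as wn' and 'from itertools import chain':

# sense = new_lesk("I went to the bank to deposit my money", "bank")
# print(sense)   # the synset with the largest overlap, or None if there is no overlap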
Example #24
File: context.py  Project: Rostlab/relna
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag of words representation, masked text, stemmed text and
    parts of speech tag for each of the tokens present between two entities in
    an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """
    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if edge.entity1.head_token.features['id'] < edge.entity2.head_token.features['id']:
                first = edge.entity1.head_token.features['id']
                second = edge.entity2.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '33_fwd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '34_fwd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '35_fwd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '36_fwd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
            else:
                first = edge.entity2.head_token.features['id']
                second = edge.entity1.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '37_bkd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '38_bkd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '39_bkd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '40_bkd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)

            for i in range(first+1, second):
                token = sentence[i]
                feature_name = '41_bow_intermediate_'+token.word+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '42_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '43_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '44_pos_intermediate_'+token.features['pos']+'_[0]'
                self.add_to_feature_set(edge, feature_name)
def stemming(line):
	stemmer = PorterStemmer()
	line_array = line.split(" ")
	for word in line_array:
		replace_word = stemmer.stem(word)
		#print replace_word	
		line = line.replace(word,replace_word)
	return line
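A quick usage sketch; note that line.replace performs plain substring substitution, so the stem of one word can also alter part of a longer word:

# stemming("the boys are playing games")
# -> roughly "the boy are play game"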
Example #26
 def stemming(line):
   from nltk.stem import PorterStemmer
   lmtzr = PorterStemmer()
   words=[]
   for word in line.split(' '):
     words.append(lmtzr.stem(word))
   line = ' '.join(words)
   return line
Example #27
	def create_test(self, test_sentence, all_words):
		ps = PorterStemmer()
		words_token = word_tokenize(test_sentence.lower())		
		words_token = [ps.stem(w) for w in words_token]
		# convert numbers to keyword DIGIT
		words_tag = ['DIGIT' if word[1] == 'CD' else word[0] for word in pos_tag(words_token)]
		t = {word: (word in words_tag) for word in all_words}
		return t
def stemming(x,logger):
	logger.info('Stemminization')
	try:
		st=PorterStemmer()
		x = [st.stem(word) for word in x.split()]
		return(" ".join(x))
	except:
		return(x)
Example #29
File: c.py  Project: chengxwcq/ee219
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = word_tokenize(sent)
    porter = PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)
    return " ".join(processed_tokens)
 def stem(self,tokens):
     """method for stemming the tokens using nltk"""        
     stemmer = PorterStemmer() #instantiating the stemmer
     stemmedTokens = set([]) #set for containing the stemmed tokens
     
     for token in tokens:
         stemmedTokens.add(stemmer.stem(token))
     return stemmedTokens
Example #31
def dummywhere(text):
    print('Inside Where_Query')
    count = 0
    ps = PorterStemmer()
    pre_cond_query = []
    post_cond_query = []
    select_query = []
    comment1 = [text.lower()]
    #print(text)
    place = GeoText(text)
    #print("Printing comment",comment1)
    for i in range(0, len(comment1)):
        token_comment = word_tokenize(comment1[i])
        #print(token_comment)
    if all(word not in 'who whose whom' for word in token_comment):
        if ('over' in token_comment):
            index1 = token_comment.index('over')
            #print(index1)
            token_comment.insert(index1, 'who')
        elif ('under' in token_comment):
            index1 = token_comment.index('under')
            #print(index1)
            token_comment.insert(index1, 'who')
    comment = [' '.join(map(str, token_comment))]
    for i in range(0, len(comment)):
        token_comment = word_tokenize(comment[i])
        #print(token_comment)
        if ('whose' in token_comment):
            index = token_comment.index('whose')
            prequery = token_comment[:index]
            print("Prequery", prequery)
            postquery = token_comment[index:(len(token_comment))]
            print("Postquery", postquery)

        elif ('who' in token_comment):
            index = token_comment.index('who')
            prequery = token_comment[:index]
            print("Prequery", prequery)
            postquery = token_comment[index:(len(token_comment))]
            print("Postquery", postquery)

    for i in range(0, len(comment)):
        token_comment = word_tokenize(comment[i])
        tagged_comment = pos_tag(token_comment)

    for i in range(0, len(prequery)):
        tagged_comment1 = pos_tag(prequery)
    for word, tag in tagged_comment1:
        if ((tag == 'NNP' or tag == 'NN' and word == 'show'
             or word == 'describe' or word == 'state') or word == 'show'
                or word == 'tell' or word == 'give'):
            pre_cond_query.append('Select')
        if word in AttrList:
            if (word == 'live'):
                word = 'city'
            if (word == 'female' or word == 'girls' or word == 'girl'
                    or word == 'male' or word == 'boys' or word == 'boy'):
                word = 'gender'
            if (word == 'enrolled' or word == 'admission'
                    or word == 'admitted'):
                word = 'registration_date'
            if (word == 'studying' or word == 'studies' or word == 'study'):
                word = 'class'
            if (word == 'roll' or word == 'serial' or word == 'no.'):
                word = 'roll_no'

            pre_cond_query.append(lemmatizer.lemmatize((word)))
    print(pre_cond_query)
    pre_cond_query.extend(['from', 'Student', 'where'])
    if (pre_cond_query[0] != 'Select'):
        pre_cond_query.insert(0, 'Select')
    print("Preconditional Query: ", pre_cond_query)

    for i in range(0, len(postquery)):
        tagged_comment2 = pos_tag(postquery)
    for word, tag in tagged_comment2:
        if word in AttrList:
            if (word == 'live'):
                word = 'city'
            if (word == 'female' or word == 'girls' or word == 'girl'
                    or word == 'male' or word == 'boys' or word == 'boy'):
                word = 'gender'
            if (word == 'enrolled' or word == 'admission'
                    or word == 'admitted'):
                word = 'registration_date'
            if (word == 'studying' or word == 'studies' or word == 'study'):
                word = 'class'
            if (word == 'roll' or word == 'serial' or word == 'no.'):
                word = 'roll_no'

            post_cond_query.append(lemmatizer.lemmatize((word)))
            if (word == 'age'):
                for word, tag in tagged_comment:
                    if (word == 'less' or word == 'under'):
                        post_cond_query.append('<')
                    if (word == 'more' or word == 'over' or word == 'above'):
                        post_cond_query.append('>')
                    if (tag == 'CD'):
                        post_cond_query.append(word)
                if ('>' not in post_cond_query and '<' not in post_cond_query):
                    post_cond_query.insert(-1, '=')

            if (word == 'roll_no'):
                for word, tag in tagged_comment:
                    if (word == 'less' or word == 'under'):
                        post_cond_query.append('<')
                    if (word == 'more' or word == 'over' or word == 'above'):
                        post_cond_query.append('>')
                    if (tag == 'CD'):
                        post_cond_query.append(word)
                if ('>' not in post_cond_query and '<' not in post_cond_query):
                    post_cond_query.insert(-1, '=')

            if (word == 'city'):
                post_cond_query.extend(['='])
                if (len(place.cities) == 0):
                    post_cond_query.append("'" + token_comment[-1] + "'")
                else:
                    post_cond_query.append(str(place.cities).strip('[]'))

            if (word == 'gender'):
                post_cond_query.extend(['='])
                if ('girls' in token_comment or 'girl' in token_comment
                        or 'female' in token_comment):
                    post_cond_query.append("'female'")
                else:
                    post_cond_query.append("'male'")
            if (word == 'name'):
                post_cond_query.append("=")
                post_cond_query.append("'" + token_comment[-1] + "'")

            if (word == 'registration_date'):
                post_cond_query.extend(['='])
                for word, tag in tagged_comment:
                    if (tag == 'CD'):
                        post_cond_query.append("'" + word + "'")
    print("PostConditional Query", post_cond_query)

    cond_query = pre_cond_query + post_cond_query
    condi_query = ' '.join(map(str, cond_query))
    print('FINAL QUERY--> ' + condi_query)
    text1.insert(END, condi_query)
Example #32
def form_whereallquery(text):
    print("From all")
    ps = PorterStemmer()
    cond_query = []
    select_query = []
    comment = [text.lower()]
    place = GeoText(text)
    for i in range(0, len(comment)):
        token_comment = word_tokenize(comment[i])
        tagged_comment = pos_tag(token_comment)
        for word, tag in tagged_comment:
            if ((tag == 'NNP' or tag == 'NN' and word == 'show'
                 or word == "give" or word == 'describe' or word == 'state')
                    or word == 'show' or word == 'tell' or word == 'give'):
                select_query.append('Select')
                cond_query.append('Select')
                cond_query.append('*')
            if word in AttrList:
                if (word == 'live' or word == 'lives'):
                    word = 'city'
                if (word == 'female' or word == 'girls' or word == 'girl'
                        or word == 'male' or word == 'boys' or word == 'boy'):
                    word = 'gender'
                if (word == 'enrolled' or word == 'admission'):
                    word = 'registration_date'
                if (word == 'studying' or word == 'studies'
                        or word == 'study'):
                    word = 'class'
                if (word == 'roll' or word == 'serial' or word == 'no.'):
                    word = 'roll_no'

                if (word == 'age'):
                    cond_query.extend(['from', ' Student', 'where', word])
                    for word, tag in tagged_comment:
                        if (word == 'less' or word == 'under'):
                            cond_query.append('<')
                        if (word == 'more' or word == 'over'
                                or word == 'above'):
                            cond_query.append('>')
                        if (tag == 'CD'):
                            cond_query.append(word)
                    if ('>' not in cond_query and '<' not in cond_query):
                        cond_query.insert(-1, '=')

                if (word == 'city' or word == 'residence' or word == 'live'):
                    word = 'city'
                    cond_query.extend(['from', 'Student', 'where', word, '='])
                    if (len(place.cities) == 0):
                        cond_query.append("'" + token_comment[-1] + "'")
                    else:
                        cond_query.append(str(place.cities).strip('[]'))

                if (word == 'gender'):
                    cond_query.extend(['from', 'Student', 'where', word, '='])
                    if ('girls' in token_comment or 'girl' in token_comment
                            or 'female' in token_comment):
                        cond_query.append("'female'")
                    else:
                        cond_query.append("'male'")

                if (word == 'registration_date'):
                    for word, tag in tagged_comment:
                        if (tag == 'CD'):
                            cond_query.extend(
                                ['from', 'Student', 'where', word, '='])
                            cond_query.append("'" + word + "'")

                if (word == 'name'):
                    cond_query.extend(['from', 'Student', 'where', word, '='])
                    cond_query.append("'" + token_comment[-1] + "'")

                if (word == 'class'):
                    cond_query.extend(['from', 'Student', 'where', word, '='])
                    cond_query.append("'" + token_comment[-1] + "'")

                if (word == 'roll_no'):
                    cond_query.extend(['from', ' Student', 'where', word])
                    for word, tag in tagged_comment:
                        if (word == 'less' or word == 'under'):
                            cond_query.append('<')
                        if (word == 'more' or word == 'over'
                                or word == 'above'):
                            cond_query.append('>')
                        if (tag == 'CD'):
                            cond_query.append(word)
                    if ('>' not in cond_query and '<' not in cond_query):
                        cond_query.insert(-1, '=')

            if (cond_query[0] != 'Select'):
                cond_query.insert(0, 'Select')
    condi_query = ' '.join(map(str, cond_query))
    print('FINAL QUERY--> ' + condi_query + ';')
    text1.insert(END, condi_query)
Example #33
def form_countquery(text):
    print("From Count Query")
    ps = PorterStemmer()
    place = GeoText(text)
    cond_query = []
    select_query = []
    comment = [text.lower()]
    #print(text)
    for i in range(0, len(comment)):
        token_comment = word_tokenize(comment[i])
        #print(token_comment)
        tagged_comment = pos_tag(token_comment)
        #print('tagged comment', tagged_comment)
        #print([(word, tag) for word, tag in tagged_comment])

        cond_query.append('Select')
        cond_query.append('COUNT(*)')
        for word, tag in tagged_comment:
            if (word == 'elder' or word == 'older' or word == 'under'
                    or word == 'above' or word == 'below'):
                word = 'age'
            if word in AttrList:
                if (word == 'live' or word == 'lives'):
                    word = 'city'
                if (word == 'female' or word == 'girls' or word == 'girl'
                        or word == 'male' or word == 'boys' or word == 'boy'):
                    word = 'gender'
                if (word == 'enrolled' or word == 'admission'):
                    word = 'registration_date'
                if (word == 'studying' or word == 'studies' or word == 'study'
                        or word == 'class'):
                    word = 'class'
                    #cond_query.append(lemmatizer.lemmatize((word)))
                if (word == 'roll' or word == 'serial' or word == 'no.'):
                    word = 'roll_no'
                if (word == 'age'):
                    cond_query.extend(['from', ' Student', 'where', word])
                    for word, tag in tagged_comment:
                        if (word == 'less' or word == 'under'):
                            cond_query.append('<')
                        if (word == 'more' or word == 'over' or word == 'above'
                                or word == 'older'):
                            cond_query.append('>')
                        if (tag == 'CD'):
                            cond_query.append(word)
                    if ('>' not in cond_query and '<' not in cond_query):
                        cond_query.insert(-1, '=')

            if (word == 'city' or word == 'residence' or word == 'live'):
                word = 'city'
                cond_query.extend(['from', 'Student', 'where', word, '='])
                if (len(place.cities) == 0):
                    cond_query.append("'" + token_comment[-1] + "'")
                else:
                    cond_query.append(str(place.cities).strip('[]'))

            if (word == 'gender'):
                cond_query.extend(['from', 'Student', 'where', word, '='])
                if ('girls' in token_comment or 'girl' in token_comment
                        or 'female' in token_comment):
                    cond_query.append("'female'")
                else:
                    cond_query.append("'male'")

            if (word == 'registration_date'):
                for word, tag in tagged_comment:
                    if (tag == 'CD'):
                        cond_query.extend(
                            ['from', 'Student', 'where', word, '='])
                        cond_query.append("'" + word + "'")

            if (word == 'name'):
                cond_query.extend(['from', 'Student', 'where', word, '='])
                cond_query.append("'" + token_comment[-1] + "'")

            if (word == 'class'):
                cond_query.extend(['from', 'Student', 'where', word, '='])
                cond_query.append("'" + token_comment[-1] + "'")
            if (word == 'roll_no'):
                cond_query.extend(['from', ' Student', 'where', word])
                for word, tag in tagged_comment:
                    if (word == 'less' or word == 'under'):
                        cond_query.append('<')
                    if (word == 'more' or word == 'over' or word == 'above'):
                        cond_query.append('>')
                    if (tag == 'CD'):
                        cond_query.append(word)

            if (cond_query[0] != 'Select'):
                cond_query.insert(0, 'Select')
    condi_query = ' '.join(map(str, cond_query))
    if (len(condi_query) == 15):
        condi_query = condi_query + ' from Student;'
    print('FINAL QUERY--> ' + condi_query + ';')
    text1.insert(END, condi_query)
Example #34
"""
Name: Karan pankaj Makhija and Jeet Thakur
Version: Python 2.7
Title : Multiclass_Classification with Feature Engineering
"""
from sklearn import preprocessing, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn import metrics
from nltk.stem import PorterStemmer

#Finding the stem of the words
ps = PorterStemmer()
my_stopwords = stopwords.words('english')
data = pd.read_json('Final1.json')

#Converting to lower case
data.iloc[:, 1] = data.iloc[:, 1].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
#Removing Punctuation
data.iloc[:, 1] = data.iloc[:, 1].str.replace('[^\w\s]', '')
#Stemming each word
data.iloc[:, 1] = data.iloc[:, 1].apply(
    lambda x: " ".join(ps.stem(x) for x in x.split()))
#Removing StopWords
data.iloc[:, 1] = data.iloc[:, 1].apply(
    lambda x: " ".join(x for x in x.split() if x not in my_stopwords))

X = data.iloc[:, 1].values
    print(path + "/" + name)
    next = path + "/" + name

    nextnameL = [re.findall(r'[a-z]+', name)[0]]
    nextname = nextnameL[0]  ## Keep just the name

    ListOfCompleteFilePaths.append(next)
    ListOfJustFileNames.append(nextname)

# In[2]:

####################################################
##  Create the Stemmer Function.........
######################################################
## Instantiate it
A_STEMMER = PorterStemmer()


#----------------------------------------
# Use NLTK's PorterStemmer in a function - DEFINE THE FUNCTION
#-------------------------------------------------------
def MY_STEMMER(str_input):
    ## Only use letters, no punct, no nums, make lowercase...
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()
    words = [A_STEMMER.stem(word) for word in words]  ## Use the Stemmer...
    return words


##################################################################
## Tokenize and Vectorize the text data from the corpus...
##############################################################
Example #36
#!/usr/bin/env python
import textmining
import glob
from nltk.stem import PorterStemmer

tdm = textmining.TermDocumentMatrix()
ps = PorterStemmer()
files = glob.glob("*.txt")
print(files)
newcont = []
newst = ''
for f in files:
    content = open(f).read()
    content = content.replace('\n', ' ')
    c = content.split()
    for i in c:
        newcont.append(ps.stem(i))
    newst = " ".join(newcont)
    tdm.add_doc(newst)
tdm.write_csv('matrix.csv', cutoff=1)
#!/usr/bin/env python
'''
  Description : This script applies simple stemming (and lemmatization)
                techniques to a Python list of words
'''
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

#Stemming is a technique for removing affixes from a word
my_list = ['going', 'cars', 'went', 'came', 'coming']
print "*****************************"
print "The Porter Stemmer Algorithm"
print "*****************************"
stemmer = PorterStemmer()
for i in my_list:
    print "%s => %s " % (i, stemmer.stem(i))
print

print "*******************************"
print "The Lancaster Stemmer Algorithm"
print "*******************************"
stemmer = LancasterStemmer()
for i in my_list:
    print "%s => %s " % (i, stemmer.stem(i))
print

print "*****************************"
print "    Wornet Lemmatization"
print "*****************************"
lem = WordNetLemmatizer()
Example #38
class MounicaSelectorPhrasalTEMP:
    def __init__(self,
                 ngram,
                 google_freq_file=RESOURCES['en']['nrr']['google'],
                 cutoff=0):
        google_freq = {}
        total = 0
        nextone = 0
        logger.debug("Loading google %d-gram frequencies..." % ngram)
        for line in open(google_freq_file, encoding='utf-8'):
            line_tokens = [t.strip() for t in line.strip().split('\t')]
            try:
                count = int(line_tokens[1])
                if count > cutoff:
                    google_freq[line_tokens[0]] = np.log10(count)
                    total += 1
                    nextone = 0
            except IndexError:
                logger.debug(
                    "Error: the following has no corresponding word: " +
                    str(line_tokens))
                pass
            if (total % 1000 == 0 and nextone == 0):
                nextone = 1
                logger.debug("N-gram count: " + str(total))
        logger.info("Total n-grams loaded: " + str(total))
        self.ngram = ngram
        self.google_freq = google_freq
        self.ps = PorterStemmer()
        self.lem = nltk.WordNetLemmatizer()

    def select(self, sent, so, eo, candidates):
        cand = list(candidates)
        scores = self.get_scores(sent, so, eo, cand)
        out = []
        for i in range(0, len(scores)):
            if (scores[i] != 0):
                out.append(cand[i])

        # This can filter out wrong tenses & duplicates before OR after ngram comparison
        out = self.filter_out_tense(sent, so, eo, out)

        return out

    def get_scores(self, sent, so, eo, candidates):
        t_b = word_tokenize(sent[:so])
        t_a = word_tokenize(sent[eo:])

        if len(t_b) < self.ngram - 1:
            t_b = ['<S>'] + t_b

        if len(t_a) < self.ngram - 1:
            t_a = t_a + ['</S>']

        scores = []
        for word in candidates:
            combos = t_b[-self.ngram + 1:] + [word] + t_a[:self.ngram - 1]
            scores.append(0)
            for j in range(0, len(combos) - self.ngram + 1):
                phrase = ''
                for word in combos[j:j + self.ngram]:
                    phrase += word + ' '
                phrase = phrase.lower()
                if phrase[:-1] in self.google_freq:
                    scores[-1] += self.google_freq[phrase[:-1]]
        return scores

    def filter_out_tense(self, sent, so, eo, candidates):
        stems = []
        out = []
        word_tag = nltk.pos_tag([sent[so:eo]])[0][1]
        stems.append(self.ps.stem(sent[so:eo]))
        for word in candidates:
            cand_stem = self.ps.stem(word)
            if cand_stem not in stems:
                stems.append(cand_stem)
                try:
                    cand_tag = self.tag_for_lemmatizer(word)
                    if cand_tag is None:
                        out.append(
                            getInflection(self.lem.lemmatize(word,
                                                             pos=cand_tag),
                                          tag=word_tag)[0])
                    else:
                        out.append(word)
                except IndexError:
                    # Lemminflect does not support all POS tags - lemminflect.readthedocs.io/en/latest/tags/
                    out.append(word)
                    logger.debug(
                        "ERROR: Lemminflect cannot convert {} with type {}, skipping"
                        .format(word, word_tag))
        return out

    def tag_for_lemmatizer(self, word):
        tag = nltk.pos_tag([word])[0][1][:2]
        if tag in ['VB']:
            return 'v'
        elif tag in ['JJ']:
            return 'a'
        elif tag in ['RB']:
            return 'r'
        else:
            return 'n'
Example #39
 def __init__(self):
     self.stemmer = NltkPorterStemmer()
Example #40
                #emoji = resolveEmoji(text)
            lang = payload['lang']
            tstamp = time.mktime(tstmp)

            return {'id':int(id), 'words':words, 'hashtags':hashtags, 'checkins':checkins, 'mentions':mentions, \
                    'ctype':q, 'crds':crds, 'timestamp':tstamp, 'lang':lang, 'urls':urls+media_urls}

        except Exception as e:
            print 'could not process object', obj['_id'], e
            return None
    else:
        return None


def batch_process(workload):
    prc = [process(d) for d in workload]
    return prc


def build_tdelta(d=0, h=0):
    return (datetime.now() - timedelta(days=d, hours=h))


tok_regex = re.compile(r'(?u)[@|#]?\w+')
punkt_regex = re.compile('[%s]' % re.escape(string.punctuation))
url_regex = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
swords = load_stop_words()
lm = PorterStemmer()
Example #41
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

#Reading the Dataset
df = pd.read_csv('train.csv')

#Dropping the rows of null values
df = df.dropna()

messages = df.copy()
messages.reset_index(inplace=True)

ps = PorterStemmer()
lemma = WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):
    review = (re.sub('[^a-zA-Z]', ' ', messages['title'][i])).lower()
    review = review.split()

    review = [
        lemma.lemmatize(word, pos='n') for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

#Extract features with CountVectorizer
def proccess_tweet(df,
                   domain_terms,
                   col='text',
                   removeSlang=True,
                   spellCorrection=False):
    '''
        Function receives
            - a dataframe with a 'text' column to be tokenized
            - a set of saved domain terms to be skipped in the stop-word (and slang) removal and stemming phases
        The function applies stemming, stop-word removal, slang correction and tokenization.
        Spell correction is currently commented out due to low performance.
    '''
    # Initialize a ProterStemmer object
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Create a set of stop words, countries names, negation words and slang words (if needed) -> saved_words
    # These words won't be stemmed, due to uses of these words in later phases
    saved_words = get_saved_words(removeSlang)

    # Tokenization, stemming and stopword (and punctuation) removal - the result is a list of the terms of the tweet.
    # The '#' is removed in the strip_punctuation function, converting hashtags into normal words.

    # Convert text to lower case
    df['tokenized_text'] = df[col].apply(lambda text: text.lower())
    print('  - Lower case completed')
    df['tokenized_text'] = df['tokenized_text'].apply(
        lambda x: x.replace('…', ''))
    df['tokenized_text'] = df['tokenized_text'].apply(
        lambda x: x.replace('"', ''))
    df['tokenized_text'] = df['tokenized_text'].apply(
        lambda x: x.replace('\n', ''))

    # Drop links and mentions
    # word_tokenizer(tweet) was replaced by .split(). Reason: probably done some extra pre-proccessing that cause damage
    df['tokenized_text'] = df['tokenized_text'].apply(lambda tweet: [
        strip_punctuation(token) for token in tweet.split()
        if ((token not in stop_words) and ('http' not in token) and
            ('pic.twitter' not in token) and ('bitly.' not in token) and
            (token != 'rt') and (token != '…') and (token != '"') and
            ('bit.ly' not in token) and (not token.startswith('@')))
    ])

    print('  - Strip punctuation completed')
    print('  - Stop-words removal completed')
    print('  - Tokenization completed')

    if removeSlang:
        # Replace slang words
        slang_dict = slang.slang_words()
        df['tokenized_text'] = df['tokenized_text'].apply(lambda tweet: [
            slang_dict[token] if token in slang_dict else token
            for token in tweet
        ])
        print('  - Slang words correction completed')

    if spellCorrection:
        # Spell correction for tokens, unless they are domain terms
        df['tokenized_text'] = df['tokenized_text_no_spell'].apply(
            lambda tokens: [
                str(TextBlob(token).correct())
                if token not in domain_terms else token for token in tokens
            ])
        # spelling correction feature
        df['num_spell_errors'] = df[[
            'tokenized_text_no_spell', 'tokenized_text'
        ]].apply(lambda x: spell_correction(x[0], x[1]))
        print('  - Spell correction completed')

    # Before using Porter stemmer - use domain stemmer and stem words that Porter stems badly.
    domainStemmer = domain_stemmer()
    # Stem it to words that are in domain_terms, so Porter stemmer will skip these words
    df['tokenized_text'] = df['tokenized_text'].apply(lambda tokens: [
        domainStemmer[token] if token in domainStemmer else token
        for token in tokens
    ])

    # Stem tokens, unless they are domain terms (or length = 1) or saved words
    df['tokenized_text'] = df['tokenized_text'].apply(lambda tokens: [
        stemmer.stem(token) if (token not in domain_terms and token not in
                                saved_words and len(token) > 1) else token
        for token in tokens
    ])
    print('  - Stemming phase completed')

    # Remove empty tokens
    df['tokenized_text'] = df['tokenized_text'].apply(
        lambda tokens: [token for token in tokens if len(token) > 1])
    print('  - Cleaning empty tokens completed')

    print('Final number of features: {}'.format(df.shape[0]))
    print('\nCOMPLETED: Pre-proccess')
    print('----------------------\n----------------------')

    return df
Example #43
filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

filtered_sentence

filtered_sentence = [w for w in word_tokens if not w in stop_words]

print(word_tokens)
print(filtered_sentence)

from nltk.stem import PorterStemmer
ps = PorterStemmer()
example_words = ["shop","shopping","shops"]
for w in example_words:
    print(ps.stem(w))

new_text = "Raindrops are the size of bullets thundered on the castle windoes for days on end; the lake rose, the flower beds turned into muddy streams, and Hagrid's pumpkins swelled to the size of garden sheds."

words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 13 20:46:26 2018

An example of stemming, e.g. "writing" becomes "write" (the stem).
Basically normalising sentences.

Using NLTK's PorterStemmer and word_tokenize.

@author: jay
"""
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

words = ["python", "pythoner", "pythoning", "pythonly"]

for w in words:
    print(w, ' = ', ps.stem(w))
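For reference, the loop above should print something close to the following Porter stems (exact spacing may differ):

# python  =  python
# pythoner  =  python
# pythoning  =  python
# pythonly  =  pythonli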
def preprocess(df):
    '''
    Preprocess text from dataframe. Splits reviews into sentences, then each sentence is preprocessed. 
    Returns dataframe with original and preprocessed text.
    '''

    # Split into sentences 
    nlp = spacy.load('en_core_web_sm')
    review_ids = []
    text = []

    # Assuming the reviews aren't split and that reviews columns are 'review',
    # for review_id, review in zip(df['review_id'], df['review']):
    #     sentences= [i for i in nlp(review).sents]
    #     for sentence in sentences:
    #         review_ids.append(review_id)
    #         text.append(str(sentence))
    
    # No review IDs
    for review in df['review']:
        sentences= [i for i in nlp(review).sents]
        for sentence in sentences:
            # review_ids.append(review_id)
            text.append(str(sentence))

    reviews_df = pd.DataFrame()
    # reviews_df['review_id'] = review_ids
    reviews_df['raw_text'] = text

    # Remove symbols,punctuations...
    reviews_df['clean_text'] = reviews_df['raw_text'].str.replace('[^\w\s]','')
    reviews_df['clean_text'] = reviews_df['clean_text'].str.replace('\d+', '')
    reviews_df['clean_text'] = reviews_df['clean_text'].str.lower()
    reviews_df['clean_text'] = reviews_df['clean_text'].str.replace('^https?:\/\/.*[\r\n]*', '')

    reviews_df['clean_text'].replace('', np.nan, inplace=True)
    drop = reviews_df[pd.isnull(reviews_df['clean_text'])].index
    reviews_df.drop(drop , inplace=True)
    reviews_df = reviews_df.reset_index(drop = True) 
   

    
    def preprocess_aspect(df):
        '''
        Preprocessing text for aspect extraction and classification.
        Returns tf-idf/corpus, LDA model.
        '''
        
        def sent_to_words(sentences):
            for sentence in sentences:
                yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

        data_words = list(sent_to_words(df['clean_text']))

        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
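        # Illustrative only (depends on the training data): if "ice" and "cream" co-occur
        # often enough, bigram_mod[['ice', 'cream', 'cone']] yields ['ice_cream', 'cone']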

        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

        # Define functions for stopwords, bigrams, trigrams and lemmatization
        def remove_stopwords(texts):
            return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]

        def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
            """https://spacy.io/api/annotation"""
            texts_out = []
            for sent in texts:
                doc = nlp(" ".join(sent)) 
                texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
            return texts_out
        
        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)

        # Form Bigrams
        data_words_bigrams = make_bigrams(data_words_nostops)

        # Initialize the spaCy English model, keeping only the tagger component (for efficiency)
        # python3 -m spacy download en_core_web_sm
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Create Corpus
        texts = data_lemmatized

        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]

        return corpus, id2word
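    # Sketch (assumption; not called in this function as written): the corpus and dictionary
    # returned above can feed a gensim LDA model, e.g.
    #   corpus, id2word = preprocess_aspect(reviews_df)
    #   lda = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10)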


    porter = PorterStemmer()

    def stemSentence(sentence):
        token_words = word_tokenize(sentence)
        stem_sentence = []
        for word in token_words:
            stem_sentence.append(porter.stem(word))
            stem_sentence.append(" ")
        return "".join(stem_sentence)

    stemmed = []
    for sentence in reviews_df['clean_text']:
        stemmed.append(stemSentence(sentence))


    reviews_df['stem_text'] = stemmed
    
    # corpus, id2word = preprocess_aspect(reviews_df)

    # Remove stop words
    # stop_words = set(stopwords.words('english'))
    # reviews_df['no_sw'] = reviews_df['clean_text'][:].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

    return reviews_df #, corpus, id2word
Example #46
0
from math import log
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords, reuters
from string import punctuation
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import codecs
import string
import math
from collections import defaultdict
''' Lemmatization
e.g. dogs -> dog, cats -> cat
doing -> do, done -> do
better -> good'''
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()
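# Note: WordNetLemmatizer maps "better" -> "good" only when given the adjective POS, e.g.
#   wnl.lemmatize('dogs')             # -> 'dog'
#   wnl.lemmatize('better', pos='a')  # -> 'good'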

#f = open("dict_RTS.txt","w", encoding = 'utf-8' )
#f.write( str(RTS))
#f.close()

# punctuation = the full set of ASCII punctuation characters
stop_words = stopwords.words('english') + list(punctuation)
DN = float(len(reuters.fileids()))
# RTS: dict mapping each Reuters file ID to its preprocessed (lemmatized, filtered) word list
RTS = {
    k: [
        wnl.lemmatize(w.lower()) for w in reuters.words(k)
        if w not in stop_words and not w.isdigit() and len(w.strip()) > 2
    ]
    for k in reuters.fileids()
}
# Author            : Sujay <*****@*****.**>
# Last Modified By  : Sujay <*****@*****.**>

from math import log as l
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bccDataLoad_for_density import *
import matplotlib.pyplot as plt

#extra to be removed
import operator

stop_words = stopwords.words('english') + ['said', 'v', 'would', 'could']
ps = PorterStemmer()
business_file = entertainment_file = politics_file = sport_file = tech_file = []


#get the data
def getFreqDist(data):
    global tech_file, business_file, sport_file, entertainment_file, politics_file
    if data == "business":
        data = [
            ps.stem(w) for w in word_tokenize(getBusiness())
            if w not in stop_words
        ]
        business_file = set(data)
    elif data == "entertainment":
        data = [
            ps.stem(w) for w in word_tokenize(getEntertainment())
# reading input.txt file
# (imports added so this snippet runs standalone)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

text = open('input.txt', encoding="utf8").read()

# Tokenization
wtokens = word_tokenize(text)
stokens = sent_tokenize(text)
print("\n Word tokens:", wtokens)
print("\n Sentence tokens:", stokens)

# POS
pos = nltk.pos_tag(wtokens)
print("\n Parts of Speech:", pos)

# Stemming
pstem = PorterStemmer()
lstem = LancasterStemmer()
sstem = SnowballStemmer("english")
pstem_output = ' '.join([pstem.stem(w) for w in wtokens])
lstem_output = ' '.join([lstem.stem(w) for w in wtokens])
sstem_output = ' '.join([sstem.stem(w) for w in wtokens])
print("\n PorterStemmer Stemming:\n", pstem_output)
print("\n LancasterStemmer Stemming:\n", lstem_output)
print("\n SnowballStemmer Stemming:\n", sstem_output)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lem_output = ' '.join([lemmatizer.lemmatize(w) for w in wtokens])
print("\n Lemmatization: \n", lem_output)

# Trigram
Example #49
0
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import string

totneg = 0
totpos = 0

porter = PorterStemmer()
lancaster = LancasterStemmer()

stopfile = open("stopwords.txt", 'r')
stopwords = stopfile.read()
stopwords = stopwords.split()

negfile = open("negative-words.txt", 'r', encoding="ISO-8859-1")
negwords = negfile.read()
negwords = negwords.split()

posfile = open("positive-words.txt", 'r', encoding="ISO-8859-1")
poswords = posfile.read()
poswords = poswords.split()

#comment ="it sends you away a believer again and quite cheered at just that"

exclude = set(string.punctuation)
comment = ''.join(ch for ch in comment if ch not in exclude)
comment = comment.split()

x = ' '.join(j for j in comment if j not in stopwords)
x = x.split()
    def stream(self, records):
        if self.custom_stopwords:
            custom_stopwords = self.custom_stopwords.replace(' ','').split(',')
        for record in records:
            if self.keep_orig:
                record['orig_text'] = record[self.textfield]
            #URL removal
            if self.remove_urls:
                record[self.textfield] = self.f_remove_urls(
                    record[self.textfield]
                )
            #Tokenization
            if (self.base_word and self.base_type == 'lemma_pos') or self.force_nltk_tokenize:
                #lemma_pos - if option is lemmatization with POS tagging do cleaning and stopword options now
                if (self.base_word and self.base_type == 'lemma_pos'):
                    record['pos_tuple'] = pos_tag(
                        word_tokenize(
                            record[self.textfield]
                        ),
                        tagset=self.pos_tagset
                    )
                    if self.default_clean and self.remove_stopwords:
                        if self.custom_stopwords:
                            stopwords = set(stop_words.words('english') + custom_stopwords)
                        else:
                            stopwords = set(stop_words.words('english'))
                        record['pos_tuple'] = [
                            [
                            re.sub(r'[\W\d]','',text[0]).lower(),
                            text[1]
                            ]
                            for text in
                            record['pos_tuple']
                            if re.sub(r'[\W\d]','',text[0]).lower() not in stopwords
                            and not re.search(r'[\W]',text[0])
                        ]
                    elif self.default_clean and not self.remove_stopwords:
                        record['pos_tuple'] = [
                            [
                            re.sub(r'[\W\d]','',text[0]).lower(),
                            text[1]
                            ]
                            for text in
                            record['pos_tuple']
                            if not re.search(r'[\W]',text[0])
                        ]
                elif self.force_nltk_tokenize:
                    record[self.textfield] = word_tokenize(
                        record[self.textfield]
                    )
            elif self.default_clean or (self.base_word and self.base_type == 'lemma'):
                #https://stackoverflow.com/a/1059601
                record[self.textfield] = re.split(r'\W+', record[self.textfield])
            else:
                record[self.textfield] = record[self.textfield].split()
            #Default Clean
            if self.default_clean and not self.base_type == 'lemma_pos':
                record[self.textfield] = [
                    re.sub(r'[\W\d]','',text).lower()
                    for text in
                    record[self.textfield]
                ]
            #Lemmatization with POS tagging
            if self.base_word and self.base_type == 'lemma_pos':
                lm = WordNetLemmatizer()
                tuple_list = []
                tag_list = []
                record[self.textfield] = []
                record['pos_tag'] = []
                for text in record['pos_tuple']:
                    keep_text = lm.lemmatize(
                        text[0],
                        self.get_wordnet_pos(text[1])
                    )
                    if keep_text:
                        record[self.textfield].append(keep_text)
                        tuple_list.append([keep_text, text[1]])
                        tag_list.append(text[1])
                        record['pos_tag'] = tag_list
                        record['pos_tuple'] = tuple_list
            #Lemmatization or Stemming with stopword removal
            if self.remove_stopwords and self.base_word and self.base_type != 'lemma_pos':
                if self.custom_stopwords:
                    stopwords = set(stop_words.words('english') + custom_stopwords)
                else:
                    stopwords = set(stop_words.words('english'))
                if self.base_type == 'lemma':
                    lm = WordNetLemmatizer()
                    record[self.textfield] = [
                        lm.lemmatize(text)
                        for text in
                        record[self.textfield]
                        if text not in stopwords
                    ]
                if self.base_type == 'stem':
                    ps = PorterStemmer()
                    record[self.textfield] = [
                        ps.stem(text)
                        for text in
                        record[self.textfield]
                        if text not in stopwords
                    ]
            #Lemmatization or Stemming without stopword removal
            if not self.remove_stopwords and self.base_word:
                if self.base_type == 'lemma':
                    lm = WordNetLemmatizer()
                    record[self.textfield] = [
                        lm.lemmatize(text)
                        for text in
                        record[self.textfield]
                    ]
                if self.base_type == 'stem':
                    ps = PorterStemmer()
                    record[self.textfield] = [
                        ps.stem(text)
                        for text in
                        record[self.textfield]
                    ]
            #Stopword Removal without Lemmatization or Stemming
            if self.remove_stopwords and not self.base_word:
                if self.custom_stopwords:
                    stopwords = set(stop_words.words('english') + custom_stopwords)
                else:
                    stopwords = set(stop_words.words('english'))
                record[self.textfield] = [
                    text 
                    for text in
                    record[self.textfield]
                    if text not in stopwords
                    ]
            #Minimum term length
            if self.term_min_len > 0:
                record[self.textfield] = [
                     i
                     for i in record[self.textfield]
                     if len(i) >= self.term_min_len
                     ]
            #ngram column creation
            (min_n,max_n) = self.ngram_range.split('-')
            #if max_n > 1 and max_n >= min_n:
            if int(max_n) > 1 and int(max_n) >= int(min_n):
                max_n = int(max_n) + 1
                ngram_extract = self.ngram(
                    [_f for _f in record[self.textfield] if _f],
                    int(min_n),
                    max_n
                )
                if ngram_extract:
                    for i in ngram_extract:
                        if not self.ngram_mix:
                            if 'ngrams_' + str(i[0]) not in record:
                                record['ngrams_' + str(i[0])] = []
                            record['ngrams_' + str(i[0])].append(i[1])
                        else:
                            if 'ngrams' not in record:
                                record['ngrams'] = []
                            record['ngrams'].append(i[1])
                else:
                    if not self.ngram_mix:
                        for n in list(range(int(min_n),int(max_n))):
                            if n!=1:
                                record['ngrams_' + str(n)] = []
                    else:
                        if 'ngrams' not in record:
                            record['ngrams'] = []
            #Final Multi-Value Output
            if not self.mv:
                record[self.textfield] = ' '.join(record[self.textfield])
                try:
                    record['pos_tag'] = ' '.join(record['pos_tag'])
                except:
                    pass

            yield record
Example #51
0
print("*****************Sentence Tokenizer*******************")
for s in senttokens:
    print(s)
print("*****************Word Tokenizer*******************")
for w in wordtokens:
    print(w)

print("*****************POS Tagging*******************")
print(nltk.pos_tag(wordtokens))

print("*****************Stemming*******************")
ps = []
ls = []
ss = []
print("*****************Porter Stemming*******************")
pStemmer = PorterStemmer()
for w in wordtokens:
    ps.append(pStemmer.stem(w))
print(ps)
print("*****************Lancaster Stemming*******************")
lStemmer = LancasterStemmer()
for w in wordtokens:
    ls.append(lStemmer.stem(w))
print(ls)
print("*****************Snowball Stemming*******************")
sStemmer = SnowballStemmer('english')
for w in wordtokens:
    ss.append(sStemmer.stem(w))
print(ss)

print("*****************Lemmatization*******************")
Example #52
0
def match(query, method, instance_url):
    '''
    search solr index using user query
    '''
    q_list = []
    q_string = ''
    solr = pysolr.Solr(instance_url)

    tokens = word_tokenize(query)
    stemmer = PorterStemmer()
    tagged_tokens = pos_tag(tokens)
    tagged_list = [tuple2str(t) for t in tagged_tokens]
    tokens_clean = deleteStopWords(tokens)
    lemmas = get_lemmatized_line(tagged_tokens)
    stem_line = [stemmer.stem(t) for t in tokens_clean]
    synonyms, hypernyms, hyponyms, meronyms, holonyms = get_semantic_features(
        tagged_tokens, tokens)
    #        head_words = get_dependency_relations(lemmas, True)
    print('user input features', ' ===> \n', 'text: \n', query, '\n',
          'tokens: \n', tokens, '\n', 'pos tag: \n', tagged_tokens, '\n',
          'remove_stopWords: \n', tokens_clean, '\n', 'lemmatized: \n', lemmas,
          '\n', 'stemmed: \n', stem_line, '\n', 'synonyms: \n', synonyms, '\n',
          'hypernyms: \n', hypernyms, '\n', 'hyponyms: \n', hyponyms, '\n',
          'meronyms: \n', meronyms, '\n', 'holonymns: \n', holonyms, '\n\n')

    pos_tag_data = '&'.join(tagged_list)
    lemmas = '&'.join(lemmas.split())
    stems = '&'.join(stem_line)
    synonyms = '&'.join(synonyms.split())
    hypernyms = '&'.join(hypernyms.split())
    hyponyms = '&'.join(hyponyms.split())
    holonyms = '&'.join(holonyms.split())
    meronyms = '&'.join(meronyms.split())

    if method == 3:
        #        head_words = '&'.join(head_words.split())
        if tokens:
            q_list.append('text:' + '&'.join(tokens))
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean))
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data)
        if lemmas:
            q_list.append('lemmas:' + lemmas)
        if stems:
            q_list.append('stems:' + stems)
        if synonyms:
            q_list.append('synonyms:' + synonyms)
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms)
        if hyponyms:
            q_list.append('hyponyms:' + hyponyms)
        if meronyms:
            q_list.append('meronyms:' + meronyms)
        if holonyms:
            q_list.append('holonymns:' + holonyms)
#            if head_words:
#                q_list.append('head_word:' + head_words)
    if method == 4:
        if tokens:
            q_list.append('text:' + '&'.join(tokens) + '^0.5')
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data + '^0.02')
        if lemmas:
            q_list.append('lemmas:' + lemmas + '^4')
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean) + '^5')
        if stems:
            q_list.append('stems:' + stems + '^1.5')
        if synonyms:
            q_list.append('synonyms:' + synonyms + '^5')
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms + '^5')


#       if hyponyms:
#                 q_list.append('hyponyms:' + hyponyms + '^4')
#         if head_words:
#                q_list.append('head_word:' + head_words + '^0.5')
#         if meronyms:
#                 q_list.append('meronyms:' + meronyms + '^1.4')
#         if holonyms:
#                 q_list.append('holonymns:' + holonyms + '^1.4')

    q_string = ', '.join(q_list)
    print('The Solr query is q=%s, fl=\'id,text\'\n' % (q_string))
    result = solr.search(q=q_string, fl='id,text')
    for r in result:
        print(r['id'])
        print(' '.join(r['text']))
Example #53
0
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

inp = input("Enter String: ")

tokens = word_tokenize(inp)
clean_tokens = tokens[:]  # copy so removing stop words doesn't affect the list we iterate over

for token in tokens:
    if token in stop_words: clean_tokens.remove(token)

table = str.maketrans('', '', string.punctuation)
clean_tokens = [word.translate(table) for word in clean_tokens]  # strip punctuation from the already-filtered tokens

all_synonyms = []

for word in clean_tokens:
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            all_synonyms.append(lemma.name())
import os
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import seaborn as sns


my_lemmatizer = WordNetLemmatizer()
vector = CountVectorizer()
stemming = PorterStemmer()
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

#Getting the stopwords corpora
stopwords_list = stopwords.words('english')
print(stopwords_list[:5])
#Loading the final merged dataset
youtube_train = pd.read_csv("C:/Users/prate/Desktop/ICT_solution/Data/final_data/final_data.csv",delimiter=',')
youtube_train_sen = youtube_train['video_title']
print(youtube_train_sen[1])
#Creating an empty array
final_sent = [];
#Initializing variables
count = 0
Example #55
0
#Transform label to binary
#data.label = data.label.map(dict(REAL=1, FAKE=0))

# Visualising the dataset
import seaborn as sb


def create_distribution(dataFile):
    return sb.countplot(x='Label', data=dataFile, palette='hls')


#create_distribution(data)
data = shuffle(data)

# Clean the text
eng_stemmer2 = PorterStemmer()
eng_stemmer = SnowballStemmer('english')
#nltk.corpus.stopwords.words('english').remove('not')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.remove('not')


def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed


#tokens = data.iloc[3, 0]
def process_data(tokens):
Example #56
0
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

file_index = open('label/index', encoding='utf-8')
stop_words = set(stopwords.words('english'))  # get stop word set
columns = ['label', 'word_count', 'text']
del_table = string.punctuation
str_table = str.maketrans('', '', '1234567890')

dict1 = {}
count = 0
file = open('output.txt', 'w')
spam_word = []

ps = PorterStemmer()
df = pd.DataFrame(columns=columns)

for line in file_index.readlines():
    tmp = line.split(' ')
    tmp_label = 1 if tmp[0] == 'spam' else 0  # mail label, spam -> 1, ham -> 0
    tmp_path = tmp[1]  # mail's path
    tmp_path = tmp_path.replace('\n', '')  # remove '\n' in path
    tmp_path = tmp_path.replace('../', '')  # remove '../' in path
    if tmp_label == 1:
        try:
            count += 1
            print(count)

            mail_file = open(tmp_path, encoding='utf-8')
            mail_text = mail_file.read()
    else:
        contents[fileNum] = contents.get(fileNum) + content

    #print contents[fileNum]
print(len(contents), '\n', fileNum)

# In[3]:

doc = open('classification.txt', 'rb')
classification2_text = doc.readlines()

# In[4]:

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()

# head of HTML file
f = open('HTML/search_keywords.html', 'w')
f.write("<html>\n")
f.write("<head> Search Keywords\n")
f.write("    <script type=\"text/javascript\" src=\"mktree.js\"></script>\n")
f.write(
    "    <link rel=\"stylesheet\" href=\"mktree.css\" type=\"text/css\">\n")
f.write("</head>\n\n")
f.write("<body>\n")
f.write("<ul class=\"mktree\">\n")

has_subdir = False
has_record = False
Example #58
0
def preprocess_reviews(revs, word_process):
    try:
        stop_words = set(stopwords.words("english"))
        processed_revs = []
        ps = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        print(len(revs))

        for rev in revs:
            # Get the categories
            if rev == '':
                continue
            rev = rev.split('|~|')
            pro = rev[0]
            con = rev[1]
            adv = rev[2]
            score = rev[3]
            pre_post = rev[4]
            rev_id = rev[5]
            company_id = rev[6]
            industry = rev[7]
            comp_good_bad = rev[8]

            # Tokenize the words
            pro_words = word_tokenize(pro)
            con_words = word_tokenize(con)
            adv_words = word_tokenize(adv)

            if word_process == "raw":
                # Option 1: Filter the stopwords out and append the category
                filtered_pros = [
                    w.lower() for w in pro_words if not w in stop_words
                ]
                filtered_cons = [
                    w.lower() for w in con_words if not w in stop_words
                ]
                filtered_adv = [
                    w.lower() for w in adv_words if not w in stop_words
                ]
            elif word_process == "stem":
                # Option 2: Filter the stopwords out, stem the words, and append the category
                filtered_pros = [
                    ps.stem(w).lower() + ("_pro" if labels else "")
                    for w in pro_words if not w in stop_words
                ]
                filtered_cons = [
                    ps.stem(w).lower() + ("_con" if labels else "")
                    for w in con_words if not w in stop_words
                ]
                filtered_adv = [
                    ps.stem(w).lower() + ("_adv" if labels else "")
                    for w in adv_words if not w in stop_words
                ]
            elif word_process == "lemma":
                # Option 3: Filter the stopwords out, lemmatize the words, and append the category
                filtered_pros = [
                    lemmatizer.lemmatize(w).lower() +
                    ("_pro" if labels else "") for w in pro_words
                    if not w in stop_words
                ]
                filtered_cons = [
                    lemmatizer.lemmatize(w).lower() +
                    ("_con" if labels else "") for w in con_words
                    if not w in stop_words
                ]
                filtered_adv = [
                    lemmatizer.lemmatize(w).lower() +
                    ("_adv" if labels else "") for w in adv_words
                    if not w in stop_words
                ]
            elif word_process == "features":
                # Option 4: Only keep the words in the specified featureset
                filtered_pros = [
                    ps.stem(w).lower() + ("_pro" if labels else "")
                    for w in pro_words if w in all_features
                ]
                filtered_cons = [
                    ps.stem(w).lower() + ("_con" if labels else "")
                    for w in con_words if w in all_features
                ]
                filtered_adv = [
                    ps.stem(w).lower() + ("_adv" if labels else "")
                    for w in adv_words if w in all_features
                ]
            else:
                print(
                    "Invalid word processing type. Please enter \"raw\", \"stem\", or \"lemma\". Exiting program..."
                )
                exit()

            # Turn the filtered words back into a sentence and create a tuple of the review and the score
            rev = (' '.join(word
                            for word in (filtered_pros + filtered_cons +
                                         filtered_adv)), int(score), pre_post,
                   sampled, rev_id, company_id, industry, comp_good_bad)

            # Append to the processed list
            processed_revs.append(rev)

        return processed_revs

    except KeyboardInterrupt:
        print('exit')
        exit(1)
Example #59
0
# Expand contractions (can skip since they are removed in the next step)
doc = contractions.fix(doc).replace('  ', ' ')
print('\nExpanded contractions.')
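# e.g. contractions.fix("don't") -> "do not" (illustrative)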

# Tokenize
tokens = word_tokenize(doc)
print(f'\nTokenizing:\n{tokens[:40]}...')

# Remove stop words
stop_words = set(stopwords.words("english"))
tokens = [w for w in tokens if w not in stop_words]
print('\nRemoved stopwords.')

# Lemmatization
wordnet_lem = WordNetLemmatizer()
tokens_lem = [wordnet_lem.lemmatize(token) for token in tokens]

# Stemming (skip as word meaning is lost)
porter_stem = PorterStemmer()
tokens_stem = [porter_stem.stem(token) for token in tokens_lem]
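# Illustrative difference (assumed inputs): the lemmatizer keeps dictionary words while the
# stemmer may not, e.g. 'studies' -> lemma 'study' vs. Porter stem 'studi'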

print(f'\nAfter Lemmatization and removing Stop words:\n{tokens_lem[:40]}...')

# Word frequency
fdist = FreqDist(tokens_lem)
print(f'\nMost common words:\n{fdist}')
print(fdist.most_common(20))
fdist.plot(20, cumulative=False)

# END
Example #60
0
def stemming(tokens):
    ps = PorterStemmer()
    StemmedWords = [ps.stem(word) for word in tokens]
    return StemmedWords
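# Minimal usage sketch (assumed token list):
#   stemming(['running', 'flies', 'cats'])  # -> ['run', 'fli', 'cat']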