def openfile(filename,output):
    print(filename)
    #starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename,"r")
    tokens = []

    #Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)

    start2 = timeit.default_timer()
    #splits the lines into words and removes the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
        
    #creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))

    start6 = timeit.default_timer()
    #if a word is not a stop word it adds it to a list 
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)

    startw = timeit.default_timer()    
    #stems each word and adds it to the output file in csv form
    f = open(output,'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                #skip the number and the W marker that follow "I"
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")

    f.close()
    file.close()
    #ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: "+output)
    print(stop - start)
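
# A minimal, hypothetical driver for openfile() above, assuming the usual imports
# (timeit, string, PorterStemmer, word_tokenize, stopwords) are in place and that
# the input files exist; the filenames here are made up for illustration.
if __name__ == "__main__":
    openfile("documents.txt", "documents.csv")  # "documents.csv" takes the first write branch
    openfile("queries.txt", "queries.csv")      # any other name takes the marker-skipping branch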
Example #2
File: sentence.py Project: Rostlab/relna
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates stemmed Bag of Words representation for each sentence that contains
    an edge, using the function given in the argument.

    By default it uses Porter stemmer

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=None, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words if stop_words is not None else []
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(
                            token.word
                    ) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(
                            token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
Example #3
def stem(string):
    """Stem a phrase"""
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    #words = string.split()
    #for i in range(len(words)):
    #    words[i] = self.stemmer.stem(words[i])
    # stemming last word only
    #string = self._reGlue(words)
    #
    #string2 = stemmer.stem(string)
    #if string2 not in stemdict:
    #    stemdict[string2] = string
    # FIX ME
    if string not in stemdict:
        if bad_unicode(string):
            ## added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
        if not temp:
            pass
        elif temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp
def tokenizeTags(str,dict_items):
    #temp map (for getting the local term frequency)
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='' and c not in dict_items:
                    try:
                        num = int(c)
                        #keep 4-digit years in [1900, 2015]; replace other numbers with NUM
                        if len(c) != 4 or num > 2015 or num < 1900:
                            c = stemmer.stem('NUM')
                    except ValueError:
                        c = stemmer.stem(c.lower())
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    #bigram generation
                #index= len(last)
                #if index>1:
                   # bigram = last[index-2]+' '+last[index-1]
                   # bigram_list.append(bigram)
    return last
Example #5
def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        if pos and ss.pos != pos:
            continue

        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary+= ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary+= ss.lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            lesk_dictionary+= list(chain(*[i.lemma_names for i in ss.hypernyms()+ss.hyponyms()]))

        if stem: # Matching exact words causes sparsity, so let's match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]

        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
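
# Hypothetical usage sketch for new_lesk() above; it assumes the NLTK WordNet corpus is
# available and the older attribute-style API (ss.definition, ss.lemma_names) that the
# function itself relies on. The sentence/word pair is made up.
sense = new_lesk("I went to the bank to deposit my money", "bank")
if sense is not None:
    print(sense, sense.definition)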
def tokenize2_bigram(str,df_freq):
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    last =[]
    bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='':
                    try:
                        num = int(c)
                        #keep 4-digit years in [1900, 2015]; replace other numbers with NUM
                        if len(c) != 4 or num > 2015 or num < 1900:
                            c = stemmer.stem('NUM')
                    except ValueError:
                        c = stemmer.stem(c.lower())
                    
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    
                    #bigram generation
                    index = len(last)
                    if index > 1:
                        bigram = last[index-2]+' '+last[index-1]
                        bigram_list.append(bigram)
                        updateDF(temp_map,df_freq,bigram)
    return bigram_list
def tokenize2(str,df_freq):
    #temp map (for getting the local term frequency)
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    
    
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
            c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            #regular expression -> strip punctuations
            if c!='':
                try:
                    num = int(c)
                    #keep 4-digit years in [1900, 2015]; replace other numbers with NUM
                    if len(c) != 4 or num > 2015 or num < 1900:
                        c = stemmer.stem('NUM')
                except ValueError:
                    c = stemmer.stem(c.lower())

                last.append(c)
                updateDF(temp_map,df_freq,c)
    return last
Example #8
File: context.py Project: Rostlab/relna
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag of words representation, masked text, stemmed text and
    parts of speech tag for each of the tokens present between two entities in
    an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """
    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if edge.entity1.head_token.features['id'] < edge.entity2.head_token.features['id']:
                first = edge.entity1.head_token.features['id']
                second = edge.entity2.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '33_fwd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '34_fwd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '35_fwd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '36_fwd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
            else:
                first = edge.entity2.head_token.features['id']
                second = edge.entity1.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '37_bkd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '38_bkd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '39_bkd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '40_bkd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)

            for i in range(first+1, second):
                token = sentence[i]
                feature_name = '41_bow_intermediate_'+token.word+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '42_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '43_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '44_pos_intermediate_'+token.features['pos']+'_[0]'
                self.add_to_feature_set(edge, feature_name)
Example #9
class EntityHeadTokenFeatureGenerator(EdgeFeatureGenerator):
    """
    Calculate the head token for each entity, using a simple heuristic - the
    distance to the root of the sentence.

    If the entity has just one token, then that forms the head token.
    If the entity has multiple tokens, then the token which is closest to the
    root of the sentence forms the entity head.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: whether the mode is training or testing, default True
    :type training_mode: bool
    """
    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            entity1 = edge.entity1
            entity2 = edge.entity2

            self.named_entity_count('entity1_', entity1.class_id, edge)
            self.named_entity_count('entity2_', entity2.class_id, edge)

            entity1_stem = self.stemmer.stem(entity1.head_token.word)
            entity1_non_stem = entity1.head_token.word[len(entity1_stem):]
            entity2_stem = self.stemmer.stem(entity2.head_token.word)
            entity2_non_stem = entity2.head_token.word[len(entity2_stem):]

            feature_name_1_1 = '7_entity1_txt_' + entity1.head_token.word + '_[0]'
            feature_name_2_1 = '7_entity2_txt_' + entity2.head_token.word + '_[0]'
            feature_name_1_2 = '8_entity1_pos_' + entity1.head_token.features['pos'] + '_[0]'
            feature_name_2_2 = '8_entity2_pos_' + entity2.head_token.features['pos'] + '_[0]'
            feature_name_1_3 = '9_entity1_stem_' + entity1_stem + '_[0]'
            feature_name_2_3 = '9_entity2_stem_' + entity2_stem + '_[0]'
            feature_name_1_4 = '10_entity1_nonstem_' + entity1_non_stem + '_[0]'
            feature_name_2_4 = '10_entity2_nonstem_' + entity2_non_stem + '_[0]'

            self.add_to_feature_set(edge, feature_name_1_1)
            self.add_to_feature_set(edge, feature_name_2_1)
            self.add_to_feature_set(edge, feature_name_1_2)
            self.add_to_feature_set(edge, feature_name_2_2)
            self.add_to_feature_set(edge, feature_name_1_3)
            self.add_to_feature_set(edge, feature_name_2_3)
            self.add_to_feature_set(edge, feature_name_1_4)
            self.add_to_feature_set(edge, feature_name_2_4)

    def named_entity_count(self, prefix, entity_type, edge):
        entities = edge.part.get_entities_in_sentence(edge.sentence_id, entity_type)
        feature_name = '1_'+prefix+entity_type+'_count_['+str(len(entities))+']'
        self.add_to_feature_set(edge, feature_name)
Example #10
File: indexer.py Project: kellino/UCrawL
class Indexer():
    def __init__(self, rem_punc=True, rem_stop=True):
        self.rem_punc = rem_punc
        self.rem_stop = rem_stop
        self.stoplist = stopwords.words('english')
        self.punctuation = list(string.punctuation)
        self.token_dict = dict()
        self.pst = PorterStemmer()
        self.postings_list = dict()

    def get_pages(self):
        with open('./data/ucl', 'r') as ifile:
            contents = ifile.read()
            for page in contents.split('visited:'):
                self.parse_page(page)

    def parse_page(self, page):
        page = unicode(page, errors='ignore')
        lines = page.strip().split()
        if len(lines) > 2:
            title = lines[1]
            # tokenize and make lowercase
            tokens = [word.lower() for word in word_tokenize(str(lines[2:]))]
            # remove punctuation
            if self.rem_punc:
                tokens = [word for word in tokens if word not in self.punctuation]
            # remove stopwords
            if self.rem_stop:
                tokens = [word for word in tokens if word not in self.stoplist]
            # stem (Porter stemmer)
            tokens = [self.pst.stem(word) for word in tokens]
            # add to dictionary
            self.add_to_token_dict(title, tokens[3:])

    def add_to_token_dict(self, title, tokens):
        if tokens:
            words = dict()
            for token in tokens[1:]:
                key = self.pst.stem(token.lower())
                if key in self.token_dict:
                    self.token_dict[key] += 1
                else:
                    self.token_dict[key] = 1
                if key in words:
                    words[key] += 1
                else:
                    words[key] = 1
            self.postings_list[title] = [(k, v) for k, v in words.iteritems()]
def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient, split it into words, stem each word, and construct a new recipe from those words
    :param inputfilename:
    :param outputfilename:
    :return:
    '''


    with open(outputfilename, 'w') as ff:
        ff.write('[\n')

    with open(inputfilename) as f:
        d = eval(f.read())

    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))
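
# A small, hypothetical round trip for splitAndStem() above. It assumes the input file
# holds a Python-literal list of recipe dicts with 'id', 'cuisine' and 'ingredients'
# keys (the shape the loop reads); the filenames and the sample recipe are made up.
sample = [{"id": 1, "cuisine": "greek", "ingredients": ["romaine lettuce", "black olives"]}]
with open("recipes_in.txt", "w") as f:
    f.write(str(sample))
splitAndStem("recipes_in.txt", "recipes_out.json")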
Example #12
def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist
Example #13
def parseReviews(mypath):
  filelist = os.listdir(mypath) 
  wordDict = {}
  negationList = ["no","not","never","can't","won't","cannot","didn't","couldn't"]
  negationFlag = False
  stopwordList = set(stopwords.words("english"))
  stemmer = PorterStemmer()
  for file in filelist:
    with open(mypath + "/" + file,"r") as f:
      word_list = word_tokenize(f.read())
    for word in word_list:
      if word in negationList:
        #double negative
        if negationFlag:
          negationFlag = False
        else:
          negationFlag = True
        continue
      if not word.isalnum():
        negationFlag = False
      if word.isalnum() and word not in stopwordList:
        word = stemmer.stem(word)
        if negationFlag:
          word = "!" + word
          negationFlag = False
        if word not in wordDict:
          wordDict[word] = 1
        else:
          wordDict[word] += 1
  return wordDict
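
# Hypothetical call to parseReviews() above: point it at a folder of plain-text review
# files (the path is made up) and inspect the most frequent stemmed, negation-marked words.
counts = parseReviews("./reviews")
print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10])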
Example #14
def prepare_data(reviews):
    # run porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'],
                           'text': stemmer.stem(x['text'])}

    # clean text and remove empty items (wrap in list so the reviews can be indexed below)
    reviews = list(filter(lambda x: x != {}, reviews))
    reviews = list(map(stem_text, reviews))

    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')

    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove stopwords
    reviews = map(remove_stop_words, reviews)

    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    # remove undesired patterns
    reviews = map(clean_text, reviews)

    print('useless elements: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')

    return reviews
def deleting_stop_words_and_punctuating(text):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    word_tokenize_text = word_tokenize(text)
    words = [ps.stem(lemmatizer.lemmatize(w)) for w in word_tokenize_text]
    extra_tokens = {"''", "``", "the", "in", "'s"}
    return [w.lower() for w in words
            if w not in stop_words and w not in string.punctuation and w not in extra_tokens]
Example #16
class Document(object):
    def __init__(self, title, raw, stopwords=set()):
        self.title = title
        self.raw = raw
        self.stops = stopwords
        self.docid = 0
        self.tokens = self._tokenize()
        self.stemmer = PorterStemmer()
        self.terms = self._get_terms()
        self.log_terms = self._log_terms()
        self.magnitude = 0
    
    def _tokenize(self):
        """Takes the raw terms and returns the tokenized contents"""
        return wpt(self.raw)
                
    def _get_terms(self):
        """Gets a freqdist of the standardized terms from the list of tokens
        Developer's note: this is where I would put stopwords
        """
        stems = []
        for token in self.tokens:
            if (not token.isalnum()) or (token in self.stops) or (token.isdigit()):
                continue
            stemmed = self.stemmer.stem(token)
            stems.append(stemmed.lower())
        return FreqDist(stems)
    
    def _log_terms(self):
        return dict((term, (1+log(freq, 2))) for term, freq in self.terms.items())
    
    def __len__(self):
        """This returns the number of terms in the document. NOT the size of it.
        """
        return len(self.terms)
Example #17
def preprocess(text):
  stemmer = PorterStemmer()
  stop = stopwords.words('english')
  tokens = [tok for tok in word_tokenize(text.lower())
    if tok not in stop]
  tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
  return tokens_stemmed    
Example #18
def preprocess_document(doc):
  stopset = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  tokens = wordpunct_tokenize(doc)
  clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
  final = [stemmer.stem(word) for word in clean]
  return final
Example #19
File: utils.py Project: Muugii-bs/hommie
def preprocessing(text, debug = False):
    if debug:
        print text

    # lower case
    text = text.lower()
    if debug:
        print text

    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text

    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words

    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words

    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed

    return words, words_stemmed
Example #20
    def extract_clean_sentences(self):
        """
        Extracts sentences from plain text. Also applies the following cleaning
        operations:
        - Exclude all characters not recognized by 'utf-8' encoding
        - Exclude all characters not contained in [a-zA-Z0-9 '-]
        - Lowercase and Porter-stem every token
        """

        text = self.raw_text
        
        exclude = re.compile('[^a-zA-Z0-9 \'-]')
        linebreaks = re.compile('\s')
        excess_space = re.compile('\s+')
        stemmer = PorterStemmer()

        sentences = sent_tokenize(text)
        out = []
        for sentence in sentences:
            sentence = linebreaks.sub(' ', sentence)
            sentence = exclude.sub(' ', sentence)
            sentence = excess_space.sub(' ', sentence)
            tokens = word_tokenize(sentence)
            tokens = [stemmer.stem(t.lower()) for t in tokens]
            out.append(tokens)

        return out
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example #22
def stemText(text):

    ps = PorterStemmer()
    words = word_tokenize(text)
    all_words = []
    for w in words:
        all_words.append(ps.stem(w))
    return all_words
Example #23
def parseTranscript(transcript):

    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))

    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)

    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0

    for row in text:

        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))

        try:
            pos += (i + 1)
        except NameError:
            # the row produced no tokens, so i was never assigned
            pass

    return index
Example #24
class Stemmer(SentenceProcesser):
    def __init__(self):
        self.stemmer=PorterStemmer()
    def process(self, sentence):
        for word in sentence.words:
            word.stem=self.stemmer.stem(word.content)
        return sentence
Example #25
File: okreader.py Project: ned2/okdata
def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())

    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)
Example #26
File: RetKNN_MPRC.py Project: w2wei/XPRC
    def buildVocab(self):
        '''Build a vocabulary for the selected documents (from dir database).'''
        ## Note: The source of text should be Lucene processed field values. Lucene tokenized the text, remove stop words, and may take other unknown steps.
        ## Right now the vocabulary is built on the raw text with NLTK based stopwords removal, and tokenization. This should be improved.
        # collect contents from /database/ for each of these doc
        for pmid in self.pmidList: # self.pmidList includes the query and the 99 most similar articles selected by BM25
            self.corpus.append(file(os.path.join(self.dbDir,pmid)).read()) # corpus contains raw text (MH, title*2, abstract)
        for text in self.corpus:
            sent_tokenize_list = sent_tokenize(text.strip().lower(), "english") # tokenize an article text
            stemmed_text = []
            if sent_tokenize_list: # if sent_tokenize_list is not empty
                porter_stemmer = PorterStemmer()
                for sent in sent_tokenize_list:
                    words = TreebankWordTokenizer().tokenize(sent) # tokenize the sentence
                    words = [word.strip(string.punctuation) for word in words]
                    words = [word for word in words if not word in stopwords.words("english")]
                    words = [word for word in words if len(word)>1] # remove single letters and non alphabetic characters
                    words = [word for word in words if re.search('[a-zA-Z]',word)]
                    words = [porter_stemmer.stem(word) for word in words] # apply Porter stemmer
                    stemmed_text.append(" ".join(words))
                    self.vocab+=words
            self.stemmed_corpus.append(". ".join(stemmed_text)) # append a stemmed article text
        # save stemmed corpus
        pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir,str(self.pmidList[0])),"w"))
        # remove low frequency tokens and redundant tokens
        tokenDist = Counter(self.vocab)
        lowFreqList = []
        for token, count in tokenDist.iteritems():
            if count<2:
                lowFreqList.append(token)
        self.vocab = list(set(self.vocab)-set(lowFreqList))
        # save vocabulary
        pickle.dump(self.vocab,file(os.path.join(self.vocabDir,str(self.pmidList[0])),"w"))
class Tfidf_KeywordSelection:

    def __init__(self,keyword_count,stem=True):
        self.keyword_count = keyword_count
        self.stem = stem
        if self.stem:
            self.stemmer = PorterStemmer()

    def fit(self,X,y=None):
        return self

    def predict(self,X):
        if self.stem:
            for idx in xrange(len(X)):
                for idx_cand in xrange(len(X[idx])):
                    X[idx][idx_cand] = " ".join([self.stemmer.stem(word) for word in X[idx][idx_cand].split()])
        corpus_tfidf,dictionary = self.score_keyphrases_by_tfidf(X)
        ypred = []
        for scores in corpus_tfidf:
            scores = sorted(scores,key=lambda x:x[1],reverse=True)[:self.keyword_count]
            ypred.append([dictionary[word_idx] for word_idx,score in scores])
        return ypred


    def score_keyphrases_by_tfidf(self, candidates):
        # make gensim dictionary and corpus
        dictionary = gensim.corpora.Dictionary(candidates)
        corpus = [dictionary.doc2bow(candidate) for candidate in candidates]
        # transform corpus with tf*idf model
        tfidf = gensim.models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf, dictionary
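
# Hedged usage sketch for Tfidf_KeywordSelection above, assuming gensim is installed and
# that each document is supplied as a list of candidate phrase strings. As written the
# class targets Python 2 (xrange); swap in range for Python 3. The documents are made up.
docs = [["neural network", "deep learning", "neural network", "training data"],
        ["porter stemmer", "word token", "porter stemmer"]]
selector = Tfidf_KeywordSelection(keyword_count=2)
print(selector.predict(docs))  # top-2 tf-idf scored candidates per document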
Example #28
    def _stemmatize(self, word):
        lmtzr = WordNetLemmatizer() # lemmatizer won't stem words ending in '-ing' unless you tell it it's a verb
        stemmer = PorterStemmer()

        if word.endswith('ing'):
            return stemmer.stem(word)
        return lmtzr.lemmatize(word)
def stemming():
    ps = PorterStemmer()
    input_tweet = 'testing tests trying tries'
    words = word_tokenize(input_tweet)

    for w in words:
        print(ps.stem(w))
Example #30
class Tokenizer:

    def __init__(self):
        self.relative_path = os.path.join("my_class/")
        self.stopword_list = csv_io.read_csv(self.relative_path + 'stopword.csv') + [u',']
        self.stemmer = PorterStemmer()

    def is_stop_word(self, word):
        if word.lower() in self.stopword_list:
            return True
        else:
            return False

    def stemming(self, token):
        return self.stemmer.stem(token)

    def to_tokens(self, sentence):
        return nltk.word_tokenize(sentence)

    def is_zh (self,c):
        c_unicode = ord (c)

        # unicode range
        zh_range = [[0x2e80, 0x33ff], [0xff00, 0xffef], [0x4e00, 0x9fbb], \
                    [0xf900, 0xfad9], [0x20000, 0x2a6d6], [0x2f800, 0x2fa1d]]
        for lower, upper in zh_range:
            if c_unicode >= lower and c_unicode <= upper:
                return True
        return False
Example #31
word_tokens = [word for word in word_tokens if word.isalnum()]
print("tokens");
print(word_tokens) 

stop_words = set(stopwords.words('english')) 
filtered_sentence = [] 
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
print("tokens without stopwords");
print(filtered_sentence) 

fdist = FreqDist(filtered_sentence)
print(fdist)
fdist.plot(30,cumulative=False)
plt.show()

print("stemming")
stemmed_words = []
for word in filtered_sentence:
  stemmed_words.append(ps.stem(word))
print(stemmed_words)

print("lemma")
lemmed_words = []
for word in filtered_sentence:
  lemmed_words.append(lem.lemmatize(word,"v"))
print(lemmed_words)

print("pos")
pos_tags = nltk.pos_tag(filtered_sentence)
print(pos_tags)
Example #32
def stem(word):
    s = PorterStemmer()

    return s.stem(word)
Example #33
new_output = np.ones((output.shape[0], 2))
for i in range(output.shape[0]):
    if (output[i] == 1):
        new_output[i][0] = 1
        new_output[i][1] = 0

    else:
        new_output[i][0] = 0
        new_output[i][1] = 1

ps = PorterStemmer()

for i in range(len(l)):
    j = 0
    for j in range(len(l[i])):
        l[i][j] = ps.stem(l[i][j])

wordlist = {}
for i in range(len(l)):
    for j in range(len(l[i])):
        if (l[i][j] in wordlist):
            wordlist[l[i][j]] = wordlist[l[i][j]] + 1
        else:
            wordlist[l[i][j]] = 1

freq_l = sorted(wordlist.values())
freq_l.reverse()

freq_l[1999]

new_wordlist = {}
Example #34
        words = [word.lower() for word in words]
        #Remove stopwords and write them out to a file
        words = [word for word in words if word not in my_stopwords]

        arr_word = CountFrequency(words)

        file_after_remove_stopword = open(
            path_output + '/file_after_remove_stopword_' + str(i) + '.txt',
            "w",
            encoding="utf8")
        for (word, fre) in arr_word.items():
            file_after_remove_stopword.write(word + ':' + str(fre) + '\n')
        file_after_remove_stopword.close()
        #normalize (stem) the words
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]
        list_word.append(words)

        #build the list of documents after normalization
        str_words = ' '.join(words)
        list_document_after_preprocess.append(str_words)

        #write the summary file
        len_words = len(words)
        file_summary.write(list_name[i] + ": " + str(len(sents_cleaned)) +
                           ", " + str(len_words) + '\n')

        arr_word = CountFrequency(words)
        file_final = open(path_output + '/' + list_name[i] + '_word.txt',
                          "w",
                          encoding="utf8")
Example #35
class Preprocess():
    ## this is for each document
    def __init__(self):
        self.inittoken_list = []
        self.http_dic = {"ALL": []}
        self.number_removed_list = []
        self.number_dic = {"ALL": []}  ## recorded in init order
        self.stemmed_list = []
        self.punctuation_list = [
            ".", "'", '"', "?", ",", ")", "(", "@", "%", "$", "*", "-", "_",
            "/", "!", "#", "^", "&", "`", ":", ";"
        ]
        self.poter = PorterStemmer()
        self.stopward_list = []

        self.stopward_dic = {"ALL": []}
        init_stopward_list = [
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
            'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
            'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
            'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
            'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
            'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
            'by', 'for', 'with', 'about', 'against', 'between', 'into',
            'through', 'during', 'before', 'after', 'above', 'below', 'to',
            'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
            'again', 'further', 'then', 'once', 'here', 'there', 'when',
            'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
            'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
            'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
            'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
            "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
            'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
            'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
            'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
            'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
        ]
        self.stopward_adding(init_stopward_list)
        self.stopwarded_list = []
        self.word_dic = {}  ## format:{term:[index_list]}

    def read_file(self, storage_place):
        #print(1)
        if len(self.inittoken_list) != 0:
            return "you have already put some data in here"
        ## verify type of input
        if not isinstance(storage_place, str):
            print(
                "you should input where you store your document in string type."
            )
            return False
        ## read the document into a list of strings, one per line
        storage_place = storage_place.strip("/")
        document_list = open(storage_place, 'rt').readlines()

        ## make document into a single list of string
        valid_index = 0
        for line in document_list:
            start_flag = 0
            for stop_flag in range(len(line)):
                valid_flag = False
                if line[stop_flag] == ' ':
                    word = line[start_flag:stop_flag]
                    valid_flag = self.preprocess_word(word, valid_index)
                    start_flag = stop_flag + 1
                if line[-1] == '.' and stop_flag == len(line) - 1:
                    # check the last word for each line
                    word = line[start_flag:-1]
                    valid_flag = self.preprocess_word(word, valid_index)

                if valid_flag:
                    ## flag is true if word is valid
                    valid_index = valid_index + 1

        return True

    def preprocess_word(self, word, valid_index):
        ## build the token list and dictionaries here
        if self.http_remove(word):
            return False
        if self.number_remove(word):
            return False
        #self.minus_split( voca_index)
        pun_removed = self.punctuation_remove(word)
        if self.len_filter(pun_removed):
            return False
        stemmed = self.stemming(pun_removed)
        if self.stopwording(stemmed):
            return False
        self.word_dic_create(stemmed, valid_index)

        return True

    def http_remove(self, word):
        ## true if this word is a website address, and add it into http_dic
        flag = 0

        if "http" == word[:4] or "www" == word[:3]:
            ## first 4 chars in word == http, or first 3 chars in word == www
            flag = 1

        self.http_index = 0

        if flag:
            if not word in self.http_dic:
                self.http_dic["ALL"].append(word)
                self.http_dic[word] = []
            self.http_dic[word].append(self.http_index)
            self.http_index = self.http_index + 1
            return True
        return False

    def number_remove(self, word):
        ## true if there is a number in the word, and add it into number_dic
        flag = 0
        for char in word:
            if ord(char) < 58 and ord(char) > 47:
                ## ASCII for numbers : 48~57
                flag = 1
                break
        self.number_index = 0
        if flag:
            if not word in self.number_dic:
                self.number_dic["ALL"].append(word)
                self.number_dic[word] = []
            self.number_dic[word].append(self.number_index)
            self.number_index = self.number_index + 1
            return True
        return False

    def punctuation_remove(self, word):
        for pun in self.punctuation_list:
            if pun in word:
                word = word.replace(pun, '')
        return word

    def len_filter(self, word):
        if len(word) < 3:
            return True
        return False

    def stemming(self, word):
        stemmed = self.poter.stem(word)
        return stemmed

    def stopwording(self, word):
        ## true if the word is stopword
        if word in self.stopward_dic:
            #self.stopward_dic['ALL'].append( voca_index)
            #self.stopward_dic[ dest_document[voca_index]].append(voca_index)
            return True

        ## add normal word into stopworded_list
        self.stopwarded_list.append(word)
        return False

    def word_dic_create(self, word, index):
        if word in self.word_dic:
            self.word_dic[word].append(index)
        else:
            self.word_dic[word] = [index]

        return True

    def minus_split(self):  ####### INCOMPLETE #######
        for voca_index in range(len(self.inittoken_list)):
            if "-" in self.inittoken_list[voca_index]:
                temp = self.inittoken_list[voca_index].split("-")
                self.inittoken_list.append(temp)
                self.inittoken_list[voca_index] = self.inittoken_list[
                    voca_index].replace("-", "")
        return None

    def stopward_adding(self, new_ward_list):
        ## check for type of list
        if not isinstance(new_ward_list, list):
            print("want a list. in stopward_adding")
            return False
        for stopward in new_ward_list:
            ## check for type of each ward in list
            if not isinstance(stopward, str):
                print("want a list of string. in stopward_adding")
                return False
            ## stem and add
            stemmed_stopward = self.poter.stem(stopward)
            if not stemmed_stopward in self.stopward_dic:
                self.stopward_list.append(stemmed_stopward)
                self.stopward_dic.update({stemmed_stopward: []})
        #self.stopward_flag[0] = self.stopward_flag[0] +1
        return 0

    def punctuation_adding(self, new_pun):

        return 0

    def save_result(self):
        with open("R09725049_result.txt", "w") as text_file:
            text_file.write(str(self.stopwarded_list))
        return "file saved"
Example #36
train['tweet'] = train['tweet'].apply(
    lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

#Spelling correction
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

#Tokenization
TextBlob(train['tweet'][1]).words

#Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(
    lambda x: " ".join([st.stem(word) for word in x.split()]))

# Lemmatization
from textblob import Word
train['tweet'] = train['tweet'].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

# Advanced Text Processing
# N-grams
# N-grams are the combination of multiple words used together. Ngrams with N=1 are called unigrams. Similarly, bigrams (N=2), trigrams (N=3) and so on can also be used.
# Unigrams do not usually contain as much information as compared to bigrams and trigrams. The basic principle behind n-grams is that they capture the language structure,
# like what letter or word is likely to follow the given one. The longer the n-gram (the higher the n), the more context you have to work with. Optimum length really
# depends on the application – if your n-grams are too short, you may fail to capture important differences. On the other hand, if they are too long, you may fail
# to capture the “general knowledge” and only stick to particular cases.
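
# A short illustration of the n-gram idea described above, assuming TextBlob's ngrams()
# helper and the same train['tweet'] column used earlier in this snippet.
TextBlob(train['tweet'][0]).ngrams(1)  # unigrams
TextBlob(train['tweet'][0]).ngrams(2)  # bigrams
TextBlob(train['tweet'][0]).ngrams(3)  # trigrams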
Example #37
input_str = input_str.lower()
input_str = input_str.translate(str.maketrans('', '', string.punctuation))
result = re.sub("\d", "", input_str)
print(result)

from nltk.tokenize import word_tokenize
tokens = word_tokenize(input_str)
print(tokens)

from nltk import FreqDist
frequency_token = FreqDist(tokens)
print(frequency_token.most_common(10))

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

input_str = word_tokenize(input_str)
for word in input_str:
    print(stemmer.stem(word))

from sklearn.feature_extraction.text import CountVectorizer


def bow_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features


bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print(features)

feature_names = bow_vectorizer.get_feature_names()
Example #38
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = ['walk', 'walking', 'walked', 'walks']
for word in words:
	print(ps.stem(word))

sentence = "I walked in a park. The weather was good. I saw some children playing football. And a dog was chasing a cat."

tokens = word_tokenize(sentence)
for token in tokens:
	print(ps.stem(token))
Example #39
class Preprocessor():
    def __init__(self, *filenames):
        self.entries = Parser.parse(*filenames)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.nchars = {}
        self.docsLength = np.empty(len(self.entries))

        self.min_lower_year = None
        self.min_upper_year = None
        self.max_lower_year = None
        self.max_upper_year = None

    # used in data analyzer
    def clean_entries(self):
        for entry in self.entries:
            entry_only_words = self.__extract_only_words(entry.body)
            words = entry_only_words.lower().split()
            # if word is not in stopwords, stemm it and save it
            cleaned = [
                self.stemmer.stem(word) for word in words
                if word not in self.stop_words
            ]
            # create string from words that passed exam above (words that are not stopwords)
            entry.body = " ".join(cleaned)

    # used in main funcs
    def get_clean_data(self):
        entries_text = [entry.body for entry in self.entries]

        # "deletes" everything that is not a word or number (?, !, '...)
        entries_only_words = [
            self.__extract_only_words(entry_text)
            for entry_text in entries_text
        ]

        # final clean - performs removing stopwords and stemms every word
        clean_entries = []
        for i, entry_only_words in enumerate(entries_only_words):
            words = entry_only_words.lower().split()
            # getDocLength, use all words (this line can be moved around if other lengths (e.g. no stopwords) are used)
            # self.docsLength[i] = len(words) --> normalization done directly in SvenClassifier, there are no stop words
            # if word is not in stopwords, stemm it and save it
            cleaned = [
                self.stemmer.stem(word) for word in words
                if word not in self.stop_words
            ]
            # create string from words that passed exam above (words that are not stopwords)
            clean_entries.append(" ".join(cleaned))

        # list of strings separated by space, every string is cleaned text from one text entry in dataset
        return clean_entries

    def get_raw_words(self):
        entries_text = [entry.body.lower() for entry in self.entries]
        return [
            self.__extract_only_words(entry_text)
            for entry_text in entries_text
        ]

    def __extract_only_words(self, entry_text):
        return re.sub("[^a-zA-Z0-9]", " ", entry_text)

    def __calcNChars(self, words, sizes):
        for word in words:
            for size in sizes:
                nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                for nchar in nchars:
                    self.nchars[nchar] = self.nchars.get(nchar, 0) + 1

    def getNChars(self, items, sizes=(2, 3), freq=1):
        entries_text = [entry.body for entry in self.entries]
        entries_only_words = [
            self.__extract_only_words(entry_text)
            for entry_text in entries_text
        ]
        for item in entries_only_words:
            self.__calcNChars(item.split(), sizes=sizes)
        return [
            nchar for nchar, nchar_freq in self.nchars.items()
            if nchar_freq > freq
        ]

    def labels_for_years(self, year_type):
        text_periods = self.__get_text_periods(year_type)
        labels_lower = []
        labels_upper = []
        for text_period in text_periods:
            time_span = text_period.yes_time_span()
            labels_lower.append(time_span[LOWER])
            labels_upper.append(time_span[UPPER])
        return labels_lower, labels_upper

    def labels_for_years_norm(self, year_type):
        text_periods = self.__get_text_periods(year_type)
        labels_lower = []
        labels_upper = []
        time_span_length = self.__get_time_span_length(text_periods)
        custom_time_spans = self.__generate_custom_time_spans(time_span_length)
        for text_period in text_periods:
            time_span = text_period.yes_time_span()
            chosen_time_span = self.__find_starting_year(
                time_span, custom_time_spans)
            labels_lower.append(chosen_time_span[LOWER])
            labels_upper.append(chosen_time_span[UPPER])
        return labels_lower, labels_upper

    def __find_starting_year(self, time_span, custom_time_spans):
        intersecs = []  # intersections (amount of years)
        for custom_time_span in custom_time_spans:
            intersec = min(time_span[UPPER], custom_time_span[UPPER]) - max(
                time_span[LOWER], custom_time_span[LOWER])
            intersecs.append(intersec)
        # take time span for which intersection is largest
        return custom_time_spans[np.argmax(intersecs)]

    def __generate_custom_time_spans(self, time_span_length):
        start = 1700
        spans = []
        while start <= 2012:
            spans.append((start, start + time_span_length))
            start += time_span_length + 1
        return spans

    def __get_time_span_length(self, text_periods):
        time_span = text_periods[0].yes_time_span()
        return time_span[1] - time_span[0]

    def __get_text_periods(self, year_type):
        if year_type is "F":
            return [entry.textF for entry in self.entries]
        if year_type is "C":
            return [entry.textC for entry in self.entries]
        if year_type is "M":
            return [entry.textM for entry in self.entries]
Example #40
    if dataset['cEXT'][i] == 'n' and dataset['cNEU'][i] == 'n' and dataset['cAGR'][i] == 'n' and dataset['cCON'][i] == 'n' and dataset['cOPN'][i] == 'n':
        indices.append(i)

dataset.drop(dataset.index[indices], inplace=True)
dataset = dataset.reset_index(drop=True)

all_essays = []

for i in range(0, len(dataset['TEXT'])):
    essay = re.sub('[^a-zA-Z]', ' ', dataset['TEXT'][i])
    essay = essay.lower()
    essay = essay.split()
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    essay = [wnl.lemmatize(word) if wnl.lemmatize(word).endswith('e') else ps.stem(word) for word in essay if not word in set(stopwords.words())]
    essay = ' '.join(essay)
    all_essays.append(essay)

    print("Done " + str(i))

with open("essaysfinal", "wb") as fp:
    pickle.dump(all_essays, fp)

complete_ds = []
y_req = []

y = dataset.iloc[:, 2:7].values
for d in range(0, len(y)):
    for i in range(0, len(y[0])):
        if y[d][i] == 'y':
Example #41
class PythonRouge(ReferenceBasedMetric):
    _non_alphanumeric_regex = re.compile('[^A-Za-z0-9]')

    def __init__(
            self,
            ngram_orders: List[int] = [1, 2],
            max_sentences: Optional[int] = None,
            max_words: Optional[int] = None,
            max_bytes: Optional[int] = None,
            use_porter_stemmer: bool = True,
            remove_stopwords: bool = False,
            compute_rouge_l: bool = False,
            rouge_data_dir: str = f'{DATA_ROOT}/metrics/ROUGE-1.5.5/data'):
        super().__init__()
        self.ngram_orders = ngram_orders
        self.max_sentences = max_sentences
        self.max_words = max_words
        self.max_bytes = max_bytes
        self.use_porter_stemmer = use_porter_stemmer
        self.remove_stopwords = remove_stopwords
        self.compute_rouge_l = compute_rouge_l

        if not os.path.exists(rouge_data_dir):
            raise Exception(
                f'Path "{rouge_data_dir}" does not exist. PythonRouge requires data files from ROUGE. '
                f'Have you setup ROUGE?')

        self.stemmer = PorterStemmer(PorterStemmer.ORIGINAL_ALGORITHM)
        self.stemmer_exceptions = self._load_stemmer_exceptions(rouge_data_dir)
        self.stopwords = self._load_stopwords(rouge_data_dir)

    def _load_stemmer_exceptions(self, root: str) -> Dict[str, str]:
        exceptions = {}
        for filename in ['adj.exc', 'adv.exc', 'noun.exc', 'verb.exc']:
            file_path = os.path.join(root, 'WordNet-2.0-Exceptions', filename)
            with open(file_path, 'r') as f:
                for line in f:
                    # I think there is a bug in the original perl script
                    # to construct the exceptions database. Some of the lines
                    # have more than 2 words on them, but the script only
                    # maps the first to the second, ignoring the third.
                    columns = line.strip().split()
                    exceptions[columns[0]] = columns[1]
        return exceptions

    def _load_stopwords(self, root: str) -> Set[str]:
        file_path = os.path.join(root, 'smart_common_words.txt')
        return set(open(file_path, 'r').read().splitlines())

    def normalize_and_tokenize_sentence(self, sentence: str) -> List[str]:
        sentence = PythonRouge._non_alphanumeric_regex.sub(' ', sentence)
        sentence = sentence.lower()
        tokens = []
        for token in sentence.split():
            if self.remove_stopwords and token in self.stopwords:
                continue
            if self.use_porter_stemmer and len(token) > 3:
                if token in self.stemmer_exceptions:
                    tokens.append(self.stemmer_exceptions[token])
                else:
                    tokens.append(self.stemmer.stem(token))
            else:
                tokens.append(token)
        return tokens

    def _normalize_and_tokenize_summary(self, summary: List[str]) -> List[str]:
        return [
            self.normalize_and_tokenize_sentence(sentence)
            for sentence in summary
        ]

    def preprocess_summary(self, summary: SummaryType) -> List[List[str]]:
        summary = shorten_summary(summary, self.max_sentences, self.max_words,
                                  self.max_bytes)
        summary = self._normalize_and_tokenize_summary(summary)
        return summary

    def _count_ngrams(self, summary: SummaryType, n: int) -> Counter:
        counts = Counter()
        if isinstance(summary, str):
            summary = [summary]
        tokens = [token for sentence in summary for token in sentence]
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i + n])
            counts[ngram] += 1
        return counts

    def _calculate_intersection(
            self, reference_counts: Counter,
            summary_counts: Counter) -> Tuple[float, float, float]:
        reference_total = sum(reference_counts.values())
        summary_total = sum(summary_counts.values())
        intersection = 0
        for ngram in summary_counts:
            intersection += min(summary_counts[ngram], reference_counts[ngram])
        return reference_total, summary_total, intersection

    def _calculate_pr_f1(self, reference_total: int, summary_total: int,
                         intersection: int) -> Tuple[float, float, float]:
        precision = 0.0
        if summary_total != 0.0:
            precision = intersection / summary_total * 100
        recall = 0.0
        if reference_total != 0.0:
            recall = intersection / reference_total * 100
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        return precision, recall, f1

    def _longest_common_substring(self, tokens1: List[str], tokens2: List[str],
                                  hit_mask: List[int]) -> None:
        # Despite its name, this builds a longest-common-subsequence table and
        # marks the matched positions of tokens1 in hit_mask in place; it returns nothing.
        m, n = len(tokens1), len(tokens2)
        counter = [[0] * (n + 1) for x in range(m + 1)]
        pointers = [[None] * (n + 1) for x in range(m + 1)]
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if tokens1[i - 1] == tokens2[j - 1]:
                    counter[i][j] = counter[i - 1][j - 1] + 1
                    pointers[i][j] = '\\'
                elif counter[i - 1][j] >= counter[i][j - 1]:
                    counter[i][j] = counter[i - 1][j]
                    pointers[i][j] = '^'
                else:
                    counter[i][j] = counter[i][j - 1]
                    pointers[i][j] = '<'

        # Mark the hit_mask
        i, j = m, n
        while i != 0 and j != 0:
            if pointers[i][j] == '\\':
                i -= 1
                j -= 1
                hit_mask[i] = 1
            elif pointers[i][j] == '^':
                i -= 1
            elif pointers[i][j] == '<':
                j -= 1
            else:
                raise Exception(f'Unknown pointer: {pointers[i][j]}')

    def _calculate_rouge_l(self, references: List[SummaryType],
                           summary: SummaryType):
        model_unigrams = self._count_ngrams(summary, 1)
        num_model_unigrams = sum(count for count in model_unigrams.values())

        if isinstance(summary, str):
            summary = [summary]
        references = [[reference] if isinstance(reference, str) else reference
                      for reference in references]

        total_hit = 0
        total_base = 0
        for reference in references:
            temp_model_unigrams = Counter(model_unigrams)
            gold_unigrams = self._count_ngrams(reference, 1)
            hit, base = 0, 0
            for ref_sentence in reference:
                hit_mask = [0] * len(ref_sentence)
                base += len(ref_sentence)
                for model_sentence in summary:
                    self._longest_common_substring(ref_sentence,
                                                   model_sentence, hit_mask)

                for i, token in enumerate(ref_sentence):
                    if hit_mask[i] == 1:
                        try:
                            if temp_model_unigrams[
                                    token] > 0 and gold_unigrams[token] > 0:
                                hit += 1
                                temp_model_unigrams[token] -= 1
                                gold_unigrams[token] -= 1
                        except KeyError:
                            pass
            total_hit += hit
            total_base += base

        precision = 0.0
        if (num_model_unigrams * len(references)) != 0.0:
            precision = total_hit / (num_model_unigrams *
                                     len(references)) * 100
        recall = 0.0
        if total_base != 0.0:
            recall = total_hit / total_base * 100
        if (precision + recall) != 0.0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        return precision, recall, f1

    def score_multi_all(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        summaries_list = [[
            self.preprocess_summary(summary) for summary in summaries
        ] for summaries in summaries_list]
        references_list = [[
            self.preprocess_summary(reference) for reference in references
        ] for references in references_list]

        metrics_lists = []
        for summaries, references in zip(summaries_list, references_list):
            metrics_list = [MetricsDict() for _ in summaries]

            for n in self.ngram_orders:
                reference_ngrams_list = [
                    self._count_ngrams(reference, n)
                    for reference in references
                ]

                for i, summary in enumerate(summaries):
                    total_reference_count = 0
                    total_summary_count = 0
                    total_intersection = 0

                    summary_ngrams = self._count_ngrams(summary, n)
                    for reference_ngrams in reference_ngrams_list:
                        reference_total, summary_total, intersection = self._calculate_intersection(
                            reference_ngrams, summary_ngrams)

                        total_reference_count += reference_total
                        total_summary_count += summary_total
                        total_intersection += intersection

                    precision, recall, f1 = self._calculate_pr_f1(
                        total_reference_count, total_summary_count,
                        total_intersection)
                    metrics_list[i][f'python-rouge-{n}'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1,
                    }

            if self.compute_rouge_l:
                for i, summary in enumerate(summaries):
                    precision, recall, f1 = self._calculate_rouge_l(
                        references, summary)
                    metrics_list[i]['python-rouge-l'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1
                    }

            metrics_lists.append(metrics_list)
        return metrics_lists
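As a quick illustration of the clipped n-gram matching the class above implements (here for unigrams, i.e. python-rouge-1), a minimal standalone sketch with hypothetical token lists:

from collections import Counter

summary_tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']    # hypothetical system summary
reference_tokens = ['a', 'cat', 'sat', 'on', 'a', 'mat']      # hypothetical reference

summary_counts = Counter(summary_tokens)
reference_counts = Counter(reference_tokens)

# clipped intersection, as in _calculate_intersection
intersection = sum(min(count, reference_counts[ngram])
                   for ngram, count in summary_counts.items())
precision = intersection / sum(summary_counts.values()) * 100   # 4/6 -> 66.67
recall = intersection / sum(reference_counts.values()) * 100    # 4/6 -> 66.67
f1 = 2 * precision * recall / (precision + recall)              # 66.67
print(precision, recall, f1)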
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer_output = PorterStemmer()
print(stemmer_output.stem('happiness'))
lemmatizer_output = WordNetLemmatizer()
print(lemmatizer_output.lemmatize('happiness'))
def stemming_text(tokenized_text):
    ps = PorterStemmer()
    stemmed_words = []
    for word in tokenized_text:
        stemmed_words.append(ps.stem(word))
    return stemmed_words
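A possible call to stemming_text above, with a hypothetical token list; the stems shown in the comment are the usual Porter outputs:

print(stemming_text(['studies', 'studying', 'happiness', 'runs']))
# expected roughly: ['studi', 'studi', 'happi', 'run']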
Example #44
        tweets_clean.append(word)

print('removed stop words and punctuation:')
print(tweets_clean)

print('\033[92m')
print(tweets_clean)
print('\033[94m')

# Instantiate stemming class
stemmer = PorterStemmer()

tweets_stem = []

for word in tweets_clean:
    stem_word = stemmer.stem(word)
    tweets_stem.append(stem_word)

print('stemmed words:')
print(tweets_stem)


def process_tweet(tweet):
    """Process tweet function.
    Args:
        tweet: a string containing a tweet.
    Returns:
        tweets_clean: a list of words containing the processed tweet.

    """
    stemmer = PorterStemmer()
Example #45
def run_cleaner0():
    party_aff = [('obama', 'D'), ('clinton', 'D'), ('bush', 'R'),
                 ('gwbush', 'R')]

    file_nms_rows = session.execute("SELECT DISTINCT filename FROM iwords.raw")
    file_nms0 = []
    for row in file_nms_rows:
        file_nms0.append(row.filename)

    file_nms = np.array(file_nms0)

    for fnm in file_nms:
        sub_stmt = """SELECT * FROM iwords.raw WHERE filename = '{}';""".format(
            fnm)
        #print('sub')
        #print(sub_stmt)
        tmp_sub = session.execute(sub_stmt)
        #print('sub executed')
        # pull each row and organize data
        data = []
        for row in tmp_sub:
            tmp = {
                'filename': row.filename,
                'line_num': row.line_num,
                'doc_num': row.doc_num,
                'pres': row.pres,
                'speech_title': row.speech_title,
                'speech_dt': row.speech_dt,
                'in_office': row.in_office,
                'text': row.text
            }
            data.append(tmp)

        # clean data
        df0 = pd.DataFrame(data)
        #print(row.speech_dt)
        time_input = df0['speech_dt'][0]
        if time_input is not None:
            talk_time = pendulum.parse(str(time_input))
            pres = df0['pres'][0]
            title = df0['speech_title'][0]
            prty = [p for n, p in party_aff if n == pres][0]
            io_val = df0['in_office'][0]
            for txt_str in df0['text']:
                sw_en = stopwords.words('english')
                stemming = PorterStemmer()
                pattern = r"\w+"
                #\w+(?:'\w+)?|[^\w\s]
                #arr = nltk.word_tokenize(df0["text"][0])
                #list(map(lambda x: nltk.pos_tag(x, tagset='universal', lang='eng'), arr))
                arr0 = regexp_tokenize(txt_str, pattern)
                arr1 = map(lambda x: stemming.stem(x), arr0)
                arr2 = [word for word in arr1 if word not in sw_en]
                pos0 = nltk.pos_tag(arr2, tagset='universal', lang='eng')
                for word, pos in pos0:
                    input_time = str(talk_time)
                    if (len(input_time) > 30):
                        input_time = str(talk_time)[0:23] + "+00:00"
                    else:
                        input_time = talk_time
                    stmt = """INSERT INTO iwords.clean0 (filename, talk_time, pres, party, in_office, word, pos) VALUES ('{}', '{}', '{}', '{}', {}, '{}', '{}');""".format(
                        fnm, input_time, pres, prty, io_val, word, pos)
                    #print(stmt)
                    session.execute(stmt)
                    talk_time = talk_time.add(seconds=0.26)
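The NLTK portion of run_cleaner0 (tokenize, stem, drop stop words, POS-tag) can be exercised on its own; a small sketch with a made-up sentence, assuming the required NLTK data (stopwords, universal_tagset, the default tagger model) is installed:

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize

text = "The president delivered his remarks on the economy."  # made-up input
sw_en = stopwords.words('english')
stemming = PorterStemmer()
tokens = regexp_tokenize(text, r"\w+")         # tokenize
stems = [stemming.stem(t) for t in tokens]     # stem
kept = [w for w in stems if w not in sw_en]    # drop stop words (after stemming, as above)
print(pos_tag(kept, tagset='universal', lang='eng'))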
Example #46
# example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

# tokenizing
print(sent_tokenize(example_sent))
print("----------------hasil tokenizing--------------")
print(word_tokenize(example_sent))

# filtering
print("----------------hasil filtering--------------")
# print(word_tokens)
print(filtered_sentence)

print("----------------hasil stemming--------------")
# stemming
words = word_tokenize(example_sent)
for w in filtered_sentence:
    print(ps.stem(w))
    for docid, termid in forward_index.keys():
        if doc_id == str(docid):
            distinct_terms += 1
            total_terms += len(forward_index[(docid, termid)])
            i += 1
    return total_terms


load_term_ids()
load_doc_ids()

if (len(sys.argv) == 3):
    if (sys.argv[1] == '--term'):
        term = sys.argv[2]
        stemmer = PorterStemmer()
        stemmed = stemmer.stem(term)
        if stemmed in termids:
            load_term_info()
            term_id = termids[stemmed]
            print('Listing for term: ' + term)
            show_term_info(term_id)
        else:
            print("List for " + term + " not present.")
    elif (sys.argv[1] == '--doc'):
        doc_name = sys.argv[2]
        if doc_name in docids:
            load_doc_info()
            doc_id = docids[doc_name]
            print('Listing for doc: ' + doc_name)
            show_doc_info(doc_id)
        else:
Example #48
def get_stem(word):
    ps = PorterStemmer()
    return ps.stem(word)
Example #49
import nltk
from nltk.stem import PorterStemmer

paragraph = "John does his work intelligently. John is an intelligent man. John  is always working"
sentences = nltk.word_tokenize(paragraph)
stemmer = PorterStemmer()  #Create Object
print(sentences)
for w in sentences:
    print("Actual: %s || Stem: %s" % (w, stemmer.stem(w)))
takeOutStopWords = []
for txt in tokenized:
    temp=[]
    for word in txt:
        if word.lower() not in stop_words:
            temp.append(word)
    takeOutStopWords.append(temp)

    
#apply stemming techniques to find the words’ roots 
#PorterStemmer    
PorterStemmer=[]          # note: this list shadows the imported class name
for txt in takeOutStopWords:
    sentence=[]           # reset per document so each entry holds one document's stems
    for word in txt:
        sentence.append(ps.stem(word))
    PorterStemmer.append(sentence)

#SnowballStemmer
SnowballStemmer=[]        # likewise shadows the imported class name
for txt in takeOutStopWords:
    sentence=[]
    for word in txt:
        sentence.append(ss.stem(word))
    SnowballStemmer.append(sentence)


#list of all words
words=[]
for txt in PorterStemmer:
    for word in txt:
Example #51
corpus = " ".join(new_doc)

import re

corpus = re.sub("[^A-Za-z ]+", "", corpus)
corpus = corpus.lower()
stop_words2 = ["rt", "the", "today", "we", "i", "so", "space"]
corpus = [w for w in corpus.split(" ") if not w in stop_words2]

corpus = " ".join(corpus)

from nltk.stem import PorterStemmer

lst = PorterStemmer()

corpus = [lst.stem(w) for w in corpus.split(" ")]

corpus = " ".join(corpus)

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
count_vect = vect.fit_transform(corpus.split(" "))
count_vect.shape
names = vect.get_feature_names()

report = pd.DataFrame(count_vect.toarray(), columns=names)

file = {}
for i in report.columns:
    file[i] = report[i].sum()
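The loop above just totals each column of the document-term matrix; the same dictionary can presumably be obtained directly with pandas:

file = report.sum(axis=0).to_dict()  # equivalent to the per-column loop above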
Example #52
print()

print("Stemmers")
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer("english")

print("Languages Supported By Snowball Stemmer")
[print(x) for x in ss.languages]
print()

print(stem_format.format('Input', *stemmers))
print(stem_format.format('=' * 10, '=' * 16, '=' * 16, '=' * 16))
# [print(stem_format.format(x,ps.stem(x),ls.stem(x),ss.stem(x))) for x in wpt if len(x) > 1]
[
    print(stem_format.format(x, ps.stem(x), ls.stem(x), ss.stem(x)))
    for x in words
]
print()

print("Stopwords Finder")
stop_words = set(stopwords.words('english'))
[print(x) for x in wpt if x in stop_words]
print()

print("Lemmatizers")
wnl = WordNetLemmatizer()

print(lemma_format.format('Input', *lemmatizers))
print(lemma_format.format('=' * 10, '=' * 25, '=' * 25, '=' * 25))
# [print(lemma_format.format(x,wnl.lemmatize(x),wnl.lemmatize(x,pos="v"))) for x in wpt if len(x) > 1]
Example #53
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
http = urllib3.PoolManager()
response = http.request(
    'GET',
    "https://timesofindia.indiatimes.com/india/rbi-governor-met-pm-modi-fm-jaitley-last-week-in-bid-to-heal-rift/articleshow/66597760.cms"
)
soup = BeautifulSoup(response.data, 'html.parser')
Text = ". ".join([p.text for p in soup.find_all('div', {'class': 'Normal'})])
print(Text)
psm = PorterStemmer()
st = LancasterStemmer()
lmtzr = WordNetLemmatizer()
paras_stemmedSansStopWords = [
    psm.stem(word.lower()) for word in word_tokenize(Text)
    if word.lower() not in stopwords.words('english')
]
print(paras_stemmedSansStopWords)

stemmed_para = " ".join([word for word in paras_stemmedSansStopWords])
stemmed_para

custom = set(
    stopwords.words('english') + list(punctuation) + ["'", '"', "“", '’'])
stemmed_words = [
    word for word in paras_stemmedSansStopWords if word not in custom
]
print(stemmed_words)
print(stopwords.words('english'))
from collections import Counter
Example #54
stop_text = []
tokens = word_tokenize(text)
for i in tokens:
	if i not in download_stopwords and i not in punktuation:
		stop_text.append(i)


tokens = word_tokenize(text)  # split into words
tok_sent = sent_tokenize(text)  # split into sentences


stemsPorter = []        # Porter stemmer
porter = PorterStemmer()
for w in tokens:
    w = porter.stem(w)
    if w != "":
        stemsPorter.append(w)

stems = []
stemmer = SnowballStemmer("russian")     # Snowball stemmer
for token in tokens:
    token = stemmer.stem(token)
    if token != "" and token not in punktuation:
        stems.append(token)
result=[]
text_split=text.split(" ")
for i in range (len(text_split)):
    result.append(text_split[i])
    if stems[i] not in punktuation:
        result.append(stems[i])
Example #55
    def my_fun(self):

        ds1 = pd.read_csv(self.train_path)
        train_labels = ds1.iloc[:, 2]

        train_row = ds1.shape[0]

        ds2 = pd.read_csv(self.test_path)
        test_row = ds2.shape[0]

        corpus1 = ds1.iloc[:, 1].to_numpy()
        corpus2 = ds2.iloc[:, 1].to_numpy()

        corpus = np.concatenate([corpus1, corpus2], axis=0)

        #    print(corpus.shape)

        for i in range(len(corpus)):

            regex = re.compile('[^a-zA-Z]')
            corpus[i] = regex.sub(' ', corpus[i])
            txt = ''.join(corpus[i])
            corpus[i] = txt

        for i in range(len(corpus)):

            txt1 = corpus[i].split(' ')
            txt = ""

            for j in txt1:
                if (len(j) > 3):
                    txt += " " + j
            corpus[i] = txt

        #    clf = svm.SVC(kernel='linear',C=1)
#    clf.fit(train_data, train_labels)
#    prediction=clf.predict(test_data)
#    return prediction

        for i in range(len(corpus)):

            stemmer = PorterStemmer()

            txt1 = word_tokenize(corpus[i])
            txt = ""
            for word in txt1:
                txt += " " + stemmer.stem(word)

            corpus[i] = txt
        corpus

        my_stop_words = text.ENGLISH_STOP_WORDS

        vectorizer = TfidfVectorizer(stop_words=my_stop_words)
        X = vectorizer.fit_transform(corpus)
        X = X.toarray()

        #    print(X,X.shape)

        #from sklearn.decomposition import PCA
        #pca = PCA(n_components=1000)
        #X=pca.fit_transform(X)

        train_data = X[:train_row]
        test_data = X[train_row:]

        #
        #
        #    print("Train data shape:- ",train_data.shape)
        #    print("Train labels shape:- ",test_data.shape)
        #

        clf = svm.SVC(kernel='linear', C=1)
        clf.fit(train_data, train_labels)
        prediction = clf.predict(test_data)
        return prediction
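The TfidfVectorizer + linear-SVM steps at the end of my_fun can also be written as a scikit-learn Pipeline; a minimal sketch, assuming train_texts/test_texts hold the cleaned strings and train_labels the labels prepared above (these names are illustrative):

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# train_texts, test_texts and train_labels are assumed to be prepared as in my_fun
model = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    svm.SVC(kernel='linear', C=1),
)
model.fit(train_texts, train_labels)
prediction = model.predict(test_texts)

Note that, unlike the code above, this fits the vocabulary on the training texts only rather than on train and test combined.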
#To remove the axis value :
plt.axis("off")
plt.show()

#Stemming Example :

#Import stemming library :
from nltk.stem import PorterStemmer

porter = PorterStemmer()

#Word-list for stemming :
word_list = ["Study", "Studying", "Studies", "Studied"]

for w in word_list:
    print(porter.stem(w))
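# With NLTK's PorterStemmer (which lowercases its input by default), all four
# words are expected to collapse to the same stem, roughly: studi, studi, studi, studi.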

#Stemming Example :

#Import stemming library :
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer("english")

#Word-list for stemming :
word_list = ["Study", "Studying", "Studies", "Studied"]

for w in word_list:
    print(snowball.stem(w))

#Stemming Example :
Example #57
    def stemming_tweets(self, tweet):
        ps = PorterStemmer()

        tweets_stemming = ps.stem(tweet)

        return tweets_stemming
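NLTK's PorterStemmer.stem operates on a single word, so stemming_tweets above effectively lowercases the whole tweet and rewrites only its final suffix. A per-token variant might look like the sketch below (word_tokenize and the function name are illustrative):

from nltk.tokenize import word_tokenize

def stem_tweet_tokens(tweet):
    # stem each token separately instead of treating the whole tweet as one word
    ps = PorterStemmer()
    return [ps.stem(token) for token in word_tokenize(tweet)]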
        if elem in m:
            return elem
    return 'no problemo'


# In[96]:


data['final_Combined'] = data['final_Combined'].apply(strip_space)


# In[97]:


ps = PorterStemmer()
data['final_Combined']= data['final_Combined'].apply(lambda x: [ps.stem(elem) for elem in x] )
flag_words = [ps.stem(elem) for elem in flag_words]


# In[98]:


data['type']=data['final_Combined'].apply(check_flag_words)
data = data[data['type']!='no problemo']
# data.groupby('type').size().plot.bar()


# In[100]:


data.groupby('score').size()
Example #59
def stemData(word):
    ps = PorterStemmer()
    word = ps.stem(word)
    return word
Example #60
class SearchEngine:

    def __init__(self, file):
        self.file = file
        self.stopwords = set(stopwords.words('english'))
        self.index = defaultdict(lambda: defaultdict(int))
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stemmer = PorterStemmer()
        self.totalDocs = 0
        self.results = defaultdict(float)

    def get_files(self):
        try:
            with open(self.file,encoding='utf-8') as json_file:
                data = json.load(json_file)
                for key in data:
                    if key == "39/373":
                        continue
                    if len(data[key]) <= 300:
                        print(key)
                        self.find_text(key,data[key])
        except Exception as e:
            print(e)

    def create_tokens(self,words):
        tokens = []
        for word in words:
            word = word.lower()
            if word not in self.stopwords and len(word) <= 40 and len(word) > 1 and (re.match("^[a-z]+$",word) or re.match("^[0-9]+$",word)):
                tokens.append(self.stemmer.stem(word))
        return tokens

    def find_text(self,path,url):
        frequencies = defaultdict(int)
        self.totalDocs += 1
        soup = BeautifulSoup(open("WEBPAGES_RAW/"+path), "lxml")
        headers = [] #tokenized list
        for headerWords in soup.find_all(['h1','h2','h3','b','strong']):
            headers += self.create_tokens(self.tokenizer.tokenize(headerWords.text))


        content = [] #TOTAL VISIBLE CONTENT ON PAGE, tokenized list
        for b in soup.find_all('body'):
            content += self.create_tokens(self.tokenizer.tokenize(b.text))


        #adding to index...
        #form is word: url: tfidf*weight

        for c in content:
            if c in headers: #more weight
                self.index[c][url] = 1.5
            else:
                self.index[c][url] = 1.0
            frequencies[c] += 1



        #adding term frequency
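        # Each index entry ends up as (term frequency / document length) multiplied by
        # the header weight set above (1.5 for header/bold terms, 1.0 otherwise);
        # insertIDF later multiplies this by a smoothed IDF.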
        total = len(content)
        for (word,frequency) in frequencies.items():
            entry = self.index[word][url]
            self.index[word][url] = (frequency / total) * entry



    def query(self,query):
        query = self.create_tokens(self.tokenizer.tokenize(query))
        return query

    def insertIDF(self):
        for (word, urls) in self.index.items():
            for (url, docinfo) in urls.items():
                entry = docinfo
                tfidf = entry * math.log10((self.totalDocs + .001)/(len(urls) + .001)) #accounts for dividing by 0
                self.index[word][url] = tfidf

    def insertDB(self):
        for x in self.index:
            collection.insert({"token": str(x), "value": self.index[x]}, check_keys=False)


    def run(self):
        self.get_files() #run on all files in corpus here
        self.insertIDF()
        print("length: ", len(self.index))
        print("numbers of documents: ", self.totalDocs)
        collection.remove()  # Clears the existing database before creation
        self.insertDB()  # Inserts index in to db

    def search(self,query):
        #while True:             #Retrieves Query from DB
        self.results = defaultdict(float)
        #tempInput = str(input("Input Query: "))
        tempInput = query
        if tempInput == "quit":
            #break
            return
        query = self.query(tempInput)
        if len(query) > 1:
            for q in query:
                temp = {"token": q}
                found = collection.find_one(temp)
                try:
                    values = found["value"]
                    for v,k in values.items(): #v is url, k is tfidf
                        if v not in self.results:
                            self.results[v] = k
                        else: #multiple words in same document
                            self.results[v] += k
                except Exception as e:
                    continue
        else:
            temp = {"token": query[0]}
            found = collection.find_one(temp)
            try:
                values = found["value"]
                self.results.update(values)
            except Exception as e:
                #continue
                return


        '''print("Showing 20 results out of ", len(self.results))