def get_cooc(chunk_trees, stoplist=True):
    triples, simple_trees = [], []
    lmtzr = WordNetLemmatizer()
    for t in chunk_trees:
        entities = []
        for chunk in t[:]:
            if isinstance(chunk, Tree) and chunk.node == 'NP':
                # getting a tree for later processing of triples from the simple noun
                # phrases (if present)
                simple_trees.append(parser_smp.parse(chunk.leaves()))
                words = []
                for word, tag in chunk[:]:
                    # stem/discard elements and construct an argument
                    if (stoplist and word in STOPLIST) or \
                       (len([x for x in word if x.isalnum()]) == 0):
                        # do not process stopwords for simple trees, do not process purely
                        # non alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lmtzr.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lmtzr.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append(SEP.join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, util.COOC_RELNAME, e2))
            triples.append((e2, util.COOC_RELNAME, e1))
    return triples, simple_trees

def MakeLemmaList(tagged):
    # n: noun, v: verb, a: adjective, r: adverb, m/w/...: something else
    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []
    lm = WordNetLemmatizer()
    for i in tagged:
        # cmp() no longer exists in Python 3; startswith() does the same job here
        if i[1].startswith("N"):
            noun_op.append(lm.lemmatize(i[0], "n"))
        elif i[1].startswith("V"):
            asd = lm.lemmatize(i[0], "v")
            if asd not in ("be", "have", "do", "done", "should"):
                verb_op.append(asd)
        elif i[1].startswith("J"):
            adj_op.append(lm.lemmatize(i[0], "a"))
        elif i[1].startswith("R"):
            adv_op.append(lm.lemmatize(i[0], "r"))
        else:
            pass
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op

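# --- usage sketch (not part of the original source): MakeLemmaList expects
# (word, Penn-tag) pairs such as those produced by nltk.pos_tag ---
print(MakeLemmaList([('dogs', 'NNS'), ('ran', 'VBD'), ('quickly', 'RB')]))
# -> ['dog', 'run', 'quickly']  (nouns first, then verbs, then adverbs)
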
def decompose(text, keepOriginal):
    if text:
        # Case-folding
        text = text.lower()
        # Expand all contractions like "isn't" to "is not"
        text = expandContractions(text)
        # Remove punctuation
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        text = regex.sub('', text)
        # Remove stop words (just add words to the list you think also have to be removed)
        stopWords = ['the', 'this', 'that', 'those', 'these', 'to', 'as', 'there', 'has',
                     'and', 'or', 'is', 'not', 'a', 'an', 'of', 'but', 'in', 'by', 'on',
                     'are', 'it', 'if']
        words = text.split()
        text = ' '.join([i for i in words if i not in stopWords])
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        if keepOriginal:
            text = ' '.join([i + " " + lemmatizer.lemmatize(i) for i in words])
        else:
            text = ' '.join([lemmatizer.lemmatize(i) for i in words])
        # Remove duplicate words
        text = ' '.join(OrderedDict((word, word) for word in text.split()).keys())
    return text

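# --- usage sketch (not part of the original source): decompose() depends on an
# expandContractions() helper defined elsewhere in this codebase; a trivial,
# hypothetical stand-in is used here just to make the call runnable ---
def expandContractions(text):  # hypothetical stand-in for the real helper
    return text.replace("isn't", "is not")

print(decompose("the cars aren't running in the city", keepOriginal=False))
# stop words dropped, remaining words lemmatized, duplicates removed
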
def stemming():
    lmtzr = WordNetLemmatizer()
    # open the input as text (not 'rb') so split()/join() operate on str
    with open('date_gone.out', 'r') as fin:
        with open('stemmed.out', 'w') as fout:
            for line in fin:
                row = line.split('\t')
                l = len(row)
                if l > 5:
                    new_data = [lmtzr.lemmatize(word) for word in row[5].split(' ')]
                    row[5] = ' '.join(new_data)
                if l > 6:
                    # use a fresh list here; the original reused new_data, so
                    # column 6 was prefixed with column 5's lemmas
                    new_data = [lmtzr.lemmatize(word) for word in row[6].split(' ')]
                    row[6] = ' '.join(new_data)
                fout.write('\t'.join(row))

def firstDef(mwe, definition):
    # this is the approach of using only the first definition
    if definition == '':
        return [1, 1]
    definition = definition.split('\n')[0]
    definition = definition.replace(mwe, '')
    definition = definition.replace('(', '')
    definition = definition.replace(')', '')
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    defineArr = tokenizer.tokenize(definition)
    lmtzr = WordNetLemmatizer()
    for i in range(len(defineArr)):
        defineArr[i] = lmtzr.lemmatize(defineArr[i])
    words = mwe.split()
    for i in range(len(words)):
        words[i] = lmtzr.lemmatize(words[i])
    if words[0] in defineArr and words[1] in defineArr:
        return [1, 1]
    elif words[0] in defineArr:
        return [1, 0]
    elif words[1] in defineArr:
        return [0, 1]
    else:
        return [0, 0]

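# --- usage sketch (not from the original source): firstDef assumes a
# two-word multi-word expression, as the words[0]/words[1] indexing shows ---
print(firstDef("couch potato",
               "a lazy person who sits on the couch watching television"))
# expected: [1, 0] -- "couch" occurs in the definition, "potato" does not
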
def convert_speeches_into_matrix(features, speech_list, label):
    sample_matrix = []
    label_vector = []
    lmtzr = WordNetLemmatizer()
    for speech in speech_list:
        sample = []
        speech = re.sub('http://[a-zA-Z0-9|/|.]*', ' ', speech)
        speech = re.sub('%[0-9|.]*', ' ', speech)
        # '$' must be escaped; unescaped it anchors at end-of-string and matches nothing useful
        speech = re.sub(r'\$[0-9|.]*', ' ', speech)
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch, ' ')
        tokens = speech.split()
        # word lemmatization
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]
        #tokens = bigrams(tokens)  # uncomment this line to use bigrams instead
        unique_tokens_dict = collections.Counter(tokens)
        for fea in features:
            if fea in unique_tokens_dict:
                sample.append(unique_tokens_dict[fea])
            else:
                sample.append(0)
        sample_matrix.append(sample)
        label_vector.append(label)
    return sample_matrix, label_vector

def parseLine(line, stopWords_, wordInd, currWrd):
    """
    Removes stop words and lemmas using nltk, and punctuation using re.
    Returns a list with the valid words in the line. currWrd is the index of
    the next word occurring for the first time.
    """
    lineWords = []
    # Hyphen in hyphenated words is removed, e.g. wi-fi ==> wifi.
    line = re.sub(r'(\w)-(\w)', r'\1\2', line)
    # Replace underscore with space.
    line = re.sub(r'(\w)_(\w)', r'\1 \2', line)
    # Remove punctuation marks.
    line = re.sub(r"[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]", r'', line)
    wnLmtzr = WordNetLemmatizer()
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first
        # time assign an index to the word.
        word = word.lower()  # case of words is ignored
        # Lemmatize word using WordNet: first as noun, then as verb.
        word = wnLmtzr.lemmatize(word, 'n')
        word1 = wnLmtzr.lemmatize(word, 'v')
        if len(word1) < len(word):  # select the smaller of the two
            word = word1
        # Ignore stop words and numbers.
        if word in stopWords_ or \
           re.match(r'^\d+x?\d*$', word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords

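# --- usage sketch (not from the original source): wordInd accumulates a
# word -> first-seen-index mapping, currWrd is a one-element counter ---
wordInd, currWrd = {}, [0]
print(parseLine("Wi-Fi routers aren't working", {'the', 'and'}, wordInd, currWrd))
# e.g. ['wifi', 'router', 'arent', 'work'], with wordInd == {'wifi': 0, ...}
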
def getpurpose(matched, classname):
    lmtzr = WordNetLemmatizer()
    if classname == 'class4' or classname == 'class6' or classname == 'class3':
        exp = r'\w*?ing NN\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[0], 'v')
        return purpose
    if classname == 'class2':
        exp = r'\w*? VB\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[0], 'v')
        return purpose
    if classname == 'class5' or classname == 'class7':
        exp = r'for IN \w*? NN\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[2], 'v')
        return purpose
    # the original condition `classname=='class1' or 'class9'` was always true;
    # a membership test expresses the intended check
    if classname in ('class1', 'class9'):
        exp = r'\w*? IN \w*? VBG'
        match = re.search(exp, matched)
        if match:
            purpose_text = match.group().split()
            purpose = lmtzr.lemmatize(purpose_text[2], 'v')
            return purpose
    if classname == 'class1':
        exp = r'\w*? TO \w*? VB\w*? \w*? NN\w*?'
        match = re.search(exp, matched)
        if match:
            purpose_text = match.group().split()
            purpose = lmtzr.lemmatize(purpose_text[2], 'v')
            return purpose
    return None

def data_preprocessing(file_path):
    f = open(file_path, 'r')
    # read speeches, split on ###, and save them into a list
    speech_list = f.read().split("###")
    del speech_list[-1]
    f.close()
    f = open(file_path, 'r')
    speeches = f.read().lower()  # set all letters lower case
    f.close()
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*', ' ', speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    # escape '$' so it matches a literal dollar sign rather than end-of-string
    speeches = re.sub(r'\$[0-9|.]*', ' ', speeches)
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*', ' ', speeches)
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch, ' ')
    tokens = speeches.split()
    # word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]
    #tokens = bigrams(tokens)  # uncomment this line to use bigrams instead
    total_tokens_count = len(tokens)
    # key is word, value is the count; Counter also defaults to 0 for missing keys
    unique_tokens_dict = collections.Counter(tokens)
    result = [speech_list, unique_tokens_dict, total_tokens_count]
    return result

def stemWordMatch(question, sentence):
    lmtzr = WordNetLemmatizer()
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))
    count = 0
    for i in sentence_tokens:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_tokens]:
            count = count + 3
        elif lmtzr.lemmatize(i, 'v').lower() in \
                [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            count = count + 6
    return count

def getlemmas(tokens):
    lemmas = []
    l = WordNetLemmatizer()
    for token in tokens:
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        tokenLemmas = {}
        # Synonyms
        for syn in wn.synsets(token):
            # Derived forms and their synonyms
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    tokenLemmas[df.name()] = 3
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        # WordNet lemmas
        for x in ('v', 'a', 's', 'r', 'n'):
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        # Exact
        tokenLemmas[token] = 1
        lemmas.append(tokenLemmas)
    return lemmas

def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower()
                for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower()
                for sentence in tokenize(paragraph) for word in sentence]

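# --- usage sketch (not from the original source): lemma_tokenize relies on a
# tokenize() helper defined elsewhere that yields sentences of words; a
# hypothetical NLTK-based stand-in is used here ---
import nltk

def tokenize(paragraph):  # hypothetical stand-in for the module's tokenize()
    return [nltk.word_tokenize(s) for s in nltk.sent_tokenize(paragraph)]

print(lemma_tokenize("the cats sat. dogs were barking."))
# e.g. ['the', 'cat', 'sat', '.', 'dog', 'were', 'barking', '.']
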
def extract_cooking_methods(input_steps, title):
    steps = copy.deepcopy(input_steps)
    steps.append(title)
    tk_steps = [pos_tag(word_tokenize(w.lower())) for w in steps]
    methods = []
    for step in tk_steps:
        # methods += [wordnet_lemmatizer.lemmatize(w, pos='v').encode('ascii', 'ignore')
        #             for (w, pos) in step if 'VB' in pos]
        methods += [w.encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]
    for step in steps:
        if 'preheat' in step:
            methods += ['preheat', 'preheating']
        if 'microwav' in step:
            methods += ['microwave', 'microwaving']
        if 'place' in step:
            methods.append('place')
        if 'form' in step:
            methods.append('form')
        if 'sprinkle' in step:
            methods.append('sprinkle')
    wordnet_lemmatizer = WordNetLemmatizer()
    discard = ['be', 'use', 'need', 'should', 'allow', 'pink', 'turn', 'reserve']
    methods = [m for m in methods
               if wordnet_lemmatizer.lemmatize(m, pos='v') not in discard and len(m) > 2]
    stems = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in methods]
    gerunds = [w[:-1] + 'ing' for w in stems if w[-1] == 'e']
    gerunds += [w + 'ing' for w in stems if w[-1] != 'e']
    methods = list(set(methods + stems + gerunds))
    return methods

class LexicalBigramUnigramAnalyzer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.decode('ascii', 'ignore')):
            tagged = self.tb(sent.lower()).tags
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged
                      if t[0] not in stopwords.words('english')]
            # list() so all four rules can scan the pairs (zip is a one-shot
            # iterator in Python 3, which would leave rules 2-4 empty)
            ng = list(zip(tagged, tagged[1:]))
            rule1 = [(t[0], t[1]) for t in ng
                     if t[0][1] == wn.ADJ and t[1][1] == wn.NOUN]
            rule2 = [(t[0], t[1]) for t in ng
                     if (t[0][1] == wn.ADV and t[1][1] == wn.VERB)
                     or (t[0][1] == wn.VERB and t[1][1] == wn.ADV)]
            rule3 = [(t[0], t[1]) for t in ng
                     if t[0][1] == wn.VERB and t[1][1] == wn.VERB]
            rule4 = [(t[0], t[1]) for t in ng
                     if t[0][1] == wn.NOUN and t[1][1] == wn.NOUN]
            filtered_list = rule1 + rule2 + rule3 + rule4
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' +
                                self.lemmatizer.lemmatize(t[1][0], t[1][1])
                                for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens

def single_master_list(data):
    my_vocab = deepcopy(init_to_zero_vocab)
    data = data.lower()
    # the original pattern \S+@\S matched only one character after '@'
    data = re.sub(r"\S+@\S+", " EMAILREPLACED ", data)
    data = re.sub(r"\d+", " NUMBERREPLACED ", data)
    # the original had 'http:s?' where 'https?' was clearly intended
    data = re.sub(r"\s?https?:\/\/\w{0,3}\.\w+\.\w{0,3}\S?|w{0,3}\.\w+\.\w{0,3}\S?",
                  " URLREPLACED ", data)
    for punct in string.punctuation:
        data = data.replace(punct, " ")
    format_data = data.split()
    no_stop_words = []
    l = WordNetLemmatizer()
    # `stop` and `lem` are module-level flags controlling stop-word removal
    # and lemmatization respectively
    for word in format_data:
        if stop and word in stopwords.words('english'):
            continue
        no_stop_words.append(l.lemmatize(word) if lem else word)
    for element in no_stop_words:
        if element in my_vocab:
            my_vocab[element] += 1
    return my_vocab

def lemmatize(w, p):
    # lemmatize() is an instance method on the module-level `wnl`; the original
    # passed `wnl` again as an extra first argument, raising a TypeError
    if p.startswith("N"):
        return (wnl.lemmatize(w, 'n'), p)
    elif p.startswith("V"):
        return (wnl.lemmatize(w, 'v'), p)
    else:
        return (w, p)

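# --- usage sketch: `wnl` is assumed to be the module-level lemmatizer ---
wnl = WordNetLemmatizer()
print(lemmatize('geese', 'NNS'))    # ('goose', 'NNS')
print(lemmatize('running', 'VBG'))  # ('run', 'VBG')
print(lemmatize('quickly', 'RB'))   # ('quickly', 'RB') -- POS not handled, unchanged
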
def get_dante_answers(senseval_data):
    # TODO: implement probability based inference of accuracy, i.e. POS adds prob,
    # colloc adds prob, phrase adds prob
    #  - must find values for probs first; for colloc, adjacency affects it;
    #    for phrase, order affects it
    # Or, just test adjacency and the presence of colloc and phrase words in the
    # sentence (test both lemmatized and not)
    # Methods: set arbitrary values and adjust manually, or
    #          use a learning algorithm to find the best mix of values
    DanteAPI.initialize()
    dante = DanteAPI.get_all_word_meanings()
    print "\nDANTE parsing completed"
    dante_answers = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, raw_word = phrase["headword"]
            word = lemmatizer.lemmatize(raw_word)
            phrase_meaning = _answer_phrase(word, sentence_data, dante)
            if phrase_meaning is not None:
                dante_answers[word_id] = phrase_meaning
            else:
                dante_answers[word_id] = _answer_word(word, sentence_data, dante)
        for word_id, raw_word in sentence_data["test_words"].iteritems():
            word = lemmatizer.lemmatize(raw_word)
            dante_answers[word_id] = _answer_word(word, sentence_data, dante)
    return dante_answers

def getting_sentiment(word, pos):
    flag = 0
    if 'NN' in pos:
        tag = 'n'
    elif 'JJ' in pos:
        tag = 'a'
        if pos == 'JJS':
            flag = 1
    elif 'VB' in pos:
        tag = 'v'
    elif 'RB' in pos:
        tag = 'r'
    else:
        tag = ''
    stemmer = WordNetLemmatizer()
    if tag != '':
        x = stemmer.lemmatize(word, tag)
    else:
        x = stemmer.lemmatize(word)
    try:
        score = float(score_dic[x])  # * float(m1)
    except KeyError:
        if len(swn.senti_synsets(x, tag)) > 0:
            score = swn.senti_synsets(x, tag)[0].pos_score() * 5
        else:
            score = 100
    if flag == 1 and score != -100 and score < 4:
        score = score + 1
    elif flag == 1 and score != -100 and score > -4 and score < 0:
        score = score - 1
    print word + '--->' + str(score)
    return score

def get_singular_forms_NN(plural_forms, nn):
    lemmatizer = WordNetLemmatizer()
    singular_forms = []
    for w in set(plural_forms):
        lemma = lemmatizer.lemmatize(w)
        if lemma in nn:
            singular_forms.append(lemma)
    return singular_forms

def lemmatize(text, pos=None):
    from nltk.stem.wordnet import WordNetLemmatizer
    global _wnl
    if not _wnl:
        _wnl = WordNetLemmatizer()
    if pos:
        return _wnl.lemmatize(text, pos)
    return _wnl.lemmatize(text)

def searchString(self, sentence, search_word):
    # search sentence for given word, lemmatize everything
    lemm = WordNetLemmatizer()
    lem_search = lemm.lemmatize(search_word)
    for idx, word in enumerate(sentence.split(' ')):
        if lemm.lemmatize(word).lower() == lem_search.lower():
            return idx
    return -1  # no match found

def l(tags_list):
    tags_list = ast.literal_eval(tags_list)
    lmtzr = WordNetLemmatizer()
    return_tags_list = []
    for t in list(tags_list):
        if get_wordnet_pos(t[1]):
            return_tags_list.append(lmtzr.lemmatize(t[0], get_wordnet_pos(t[1])))
        else:
            return_tags_list.append(lmtzr.lemmatize(t[0]))
    return return_tags_list

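# --- usage sketch (not from the original source): the input is a stringified
# tag list, as produced by str(nltk.pos_tag(...)); this assumes the file's
# get_wordnet_pos() maps 'NNS' -> 'n' and 'VBD' -> 'v' ---
print(l("[('dogs', 'NNS'), ('ran', 'VBD')]"))  # e.g. ['dog', 'run']
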
def lmtz(word):
    wl = WordNetLemmatizer()
    words = word.split("/")
    word = words[0]
    tag = words[1]
    if tag.startswith("V"):
        return wl.lemmatize(word, "v") + "/" + tag
    else:
        return wl.lemmatize(word) + "/" + tag

def stemmer_word(text):
    text1 = nltk.word_tokenize(text)
    lmtzr = WordNetLemmatizer()
    tagged = nltk.pos_tag(text1)
    for element in tagged:
        if get_wordnet_pos(element[1]) != 0:
            text = text.replace(element[0],
                                lmtzr.lemmatize(element[0], get_wordnet_pos(element[1])))
        else:
            text = text.replace(element[0], lmtzr.lemmatize(element[0]))
    return text

def extract_phrases(text):
    lmtzr = WordNetLemmatizer()
    token_buffer = []
    tokens = word_tokenize(text.lower())
    annotation = pos_tag(tokens)
    for (token, pos) in annotation:
        if pos in valid_POS:
            lemma = lmtzr.lemmatize(token)
            if len(lemma) > 1:
                token_buffer.append(lemma)
    return token_buffer

def process_lemm(newLine):
    newLine = newLine.split()
    lmtzr = WordNetLemmatizer()
    for x in range(len(newLine)):
        newLine[x] = lmtzr.lemmatize(newLine[x])
        newLine[x] = lmtzr.lemmatize(newLine[x], "v")
    newLine = " ".join(newLine)
    return newLine

def lemmatize(article):
    '''
    INPUT: string
    OUTPUT: lemmatized string

    Lemmatizes all of the words in an article.
    '''
    lem = WordNetLemmatizer()
    article_lem = ' '.join([lem.lemmatize(lem.lemmatize(word, pos='v'))
                            for word in article.split()])
    article_lem = ' '.join([lem.lemmatize(lem.lemmatize(word))
                            for word in article_lem.split()])
    return article_lem

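# --- usage sketch: verb pass first, then noun pass ---
print(lemmatize("the dogs were barking"))
# e.g. 'the dog be bark'
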
def get_singular_sentence(sentence):
    lmtzr = WordNetLemmatizer()
    keywords = extract_keywords(sentence.lower())
    singular_words = []
    for keyword in keywords:
        if lmtzr.lemmatize(keyword[0]):
            singular_words.append(lmtzr.lemmatize(keyword[0]))
        else:
            singular_words.append(keyword[0])
    return ' '.join(singular_words)

def initializeData(data):
    # graphics_train = fetch_20newsgroups(subset=dataSet,
    #     categories=categories, shuffle=True, random_state=42)
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS
    # List of dicts; each element maps words to counts for one document
    termDictList = []
    # For each term, the number of documents that contain it
    termDocCountDict = {}
    # set of terms
    termSet = set()
    # list of ints; each element is the total number of terms in a tokenized document
    termCountList = []
    # get document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1
    # get termDict and termSet
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)
    return (termDictList, termCountList, termDocCountDict, termSet)

def get_tokens(words):
    """returns list of tokens"""
    wnl = WordNetLemmatizer()
    for i in range(0, len(words)):
        words[i] = words[i].lower()
        words[i] = re.sub(ur"\W", "", words[i], flags=re.U)
        # the original discarded the lemmatizer's return value here
        words[i] = wnl.lemmatize(words[i])
    stpwrd = stopwords.words('english')
    stpwrd.extend(['m', 're', 'o', 'd', 'vs', 'w', '3', '2', 'rt', 'u', 'll', 've'])
    tokens = [i for i in words if i not in stpwrd]
    return tokens

stopwords.extend(newstoplist)
new_list = []
# Tokenization and removal of stopwords
for sent1 in process_list:
    newsent = " ".join(sent1)
    word_tokens = word_tokenize(newsent)
    filtered_sentence = [w for w in word_tokens if w not in stopwords]
    new_list.append(filtered_sentence)

# Introducing lemmatization
lemma = WordNetLemmatizer()
new_list2 = []
for sent1 in new_list:
    normalized = " ".join(lemma.lemmatize(word, 'n') for word in sent1)
    x = normalized.split()
    y = [s for s in x if len(s) > 2]
    new_list2.append(y)

# Using bigrams
texts = new_list2
phrases = Phrases(new_list2)
bigram = Phraser(phrases)
texts = [bigram[line] for line in new_list2]

# NMF is able to use tf-idf, so using TFIDF
no_features = 750
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features,

def lemmatize_word(word):
    if len(word) < 4:
        return word
    lem = WordNetLemmatizer()
    return lem.lemmatize(lem.lemmatize(word, "n"), "v")

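# --- usage sketch: noun pass first, then verb pass ---
print(lemmatize_word("cat"))       # 'cat' (shorter than 4 chars, returned as-is)
print(lemmatize_word("meetings"))  # 'meetings' -> 'meeting' (noun) -> 'meet' (verb)
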
# Query System
while True:
    query = input("Enter Query\n")
    if query == "exit":
        break
    # Query Preprocessing
    query = query.lower()
    query = query.split(' ')
    # Lemmatization (the original looped over an undefined `name`; the query
    # tokens are what get filtered and lemmatized here)
    temp = []
    for word in query:
        if word not in stop_words:
            temp.append(lem.lemmatize(word))
    query = temp
    print(query)
    # Creating an empty graph for output
    T = nx.empty_graph(0, create_using=nx.MultiDiGraph())
    for q_term in query:
        if q_term in inverted_index:
            # Fetch the nodes and edges corresponding to each term
            candidates = inverted_index[q_term]
            print('Candidates:', candidates)
            for term in candidates:
                # Load the edges with term name, if they exist

termSet = set()
termSet = getTerms(termSet, termDict)
with open(remappedFile) as readFile:
    idSet = {line.split('\t')[0] for line in readFile}
with open(descFile) as readFile:
    termDict = {
        eval(line.split('\t')[0]): eval(line.split('\t')[1])
        for line in readFile if eval(line.split('\t')[0]) in idSet
    }
termSet = getTerms(termSet, termDict)
remove_words = ['cell', 'neoplasm', 'neoplasms', 'multiple']
for term in remove_words:
    termSet.discard(term)  # discard() avoids a KeyError if the term is absent
if nlpType == 'stem':
    snow = SnowballStemmer('english')
    termSet = {snow.stem(term) for term in termSet}
elif nlpType == 'lemmatize':
    lemmatizer = WordNetLemmatizer()
    termSet = {lemmatizer.lemmatize(term) for term in termSet}
with open(keepWordFile, 'w') as writeFile:
    for term in termSet:
        writeFile.write(term + '\n')

class JournalTitleAbbreviationProvider(StashableBase):
    """Manage resources required to support journal title abbreviation assignment
    using ISO LTWA abbreviations at:

        https://www.issn.org/services/online-services/access-to-the-ltwa/

    Portions of this module have been adapted from the approach developed in
    https://github.com/adlpr/iso4.git with the following license:

    MIT License
    Copyright (c) 2018 Alex DelPriore

    Permission is hereby granted, free of charge, to any person obtaining a copy of
    this software and associated documentation files (the "Software"), to deal in
    the Software without restriction, including without limitation the rights to
    use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
    of the Software, and to permit persons to whom the Software is furnished to do
    so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.
    """

    def __init__(self, **kwargs):
        dirName = "journal-abbreviations"
        cachePath = kwargs.get("cachePath", ".")
        super(JournalTitleAbbreviationProvider, self).__init__(cachePath, [dirName])
        urlTargetIsoLtwa = kwargs.get(
            "urlTargetLtwa",
            "https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt")
        dirPath = os.path.join(cachePath, dirName)
        useCache = kwargs.get("useCache", True)
        #
        self.__noAbbrevPlaceHolder = "n.a."
        self.__prefixKey = "prefix"
        self.__suffixKey = "suffix"
        self.__infixKey = "infix"
        self.__fullWordKey = "full"
        self.__lowercaseFlag = "lower"
        self.__uppercaseFlag = "upper"
        self.__titlecaseFlag = "title"
        #
        self.__wml = WordNetLemmatizer()
        #
        self.__stopWords = set([
            "a", "about", "afore", "after", "ago", "along", "amid", "among",
            "amongst", "an", "and", "apropos", "as", "at", "atop", "but", "by",
            "ca", "circa", "for", "from", "hence", "in", "into", "like", "nor",
            "of", "off", "on", "onto", "ontop", "or", "out", "over", "per",
            "since", "so", "than", "the", "though", "til", "till", "to",
            "unlike", "until", "unto", "up", "upon", "upside", "versus", "via",
            "vis-a-vis", "vs", "when", "whenever", "where", "whereas",
            "wherever", "while", "with", "within", "yet",
            "aus", "des", "der", "für", "im", "und", "zu", "zur",
            "da", "de", "del", "della", "delle", "di", "do", "e", "el", "en",
            "et", "i", "la", "le", "lo", "las", "les", "los", "y", "van",
            "voor", "og",
        ])
        self.__abbrevD, self.__conflictD, self.__multiWordTermList = \
            self.__rebuildCache(urlTargetIsoLtwa, dirPath, useCache)
        # Tokenize a string at space boundaries while respecting a special list
        # of multi-word strings -
        self.__tokenizerRegex = re.compile(
            "({}|\\s+)".format("|".join(
                ["(?:^|\\s){}(?:\\s|$)".format(w) for w in self.__multiWordTermList])),
            flags=re.I)

    def testCache(self):
        # Lengths ...
        try:
            logger.info("Abbreviation length LTWA %d", len(self.__abbrevD["full"]))
            if len(self.__abbrevD) == 4 and len(self.__abbrevD["full"]) > 39000 \
                    and len(self.__multiWordTermList) > 250:
                return True
        except Exception:
            pass
        return False

    def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
        """Rebuild the cache of ISO abbreviation term data

        Args:
            urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
            dirPath (str): cache path
            useCache (bool): flag to use cached files

        Returns:
            tuple: (dict) title word abbreviations
                   (dict) language conflict dictionary
                   (list) multi-word abbreviation targets

        Notes:
            ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
            https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
        """
        aD = {}
        mU = MarshalUtil(workPath=dirPath)
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
        logger.debug("Using cache data path %s", dirPath)
        mU.mkdir(dirPath)
        if not useCache:
            for fp in [isoLtwaNamePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and mU.exists(isoLtwaNamePath):
            aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
            logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
            ok = fU.get(urlTargetIsoLtwa, fp)
            aD = self.__getLtwaTerms(dirPath, fp)
            ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
            logger.debug("abbrevD keys %r", list(aD.keys()))
            logger.debug("Caching %d ISO LTWA in %s status %r",
                         len(aD["abbrev"]), isoLtwaNamePath, ok)
        #
        abbrevD = aD["abbrev"] if "abbrev" in aD else {}
        conflictD = aD["conflicts"] if "conflicts" in aD else {}
        multiWordTermL = aD["multi_word_abbrev"] if "multi_word_abbrev" in aD else []
        #
        return abbrevD, conflictD, multiWordTermL

    def getJournalAbbreviation(self, title, usePunctuation=True):
        #
        useLangs = ["eng"]
        title = unicodedata.normalize("NFKD", title)
        useLangs = set(useLangs)
        # split the title either at spaces or at the defined multi-word targets
        titleWords = list(filter(lambda w: w.strip(), self.__tokenizerRegex.split(title)))
        retWordList = []
        # Exception for single-word titles
        if len(titleWords) == 1 and len(titleWords[0].split(" ")) == 1:
            return title
        for origWord in titleWords:
            # normalize and lemmatize
            wordNorm = self.__normalizeWord(origWord)
            # skip stopwords
            if wordNorm in self.__stopWords:
                continue
            # if the normalized word fails, try the lemma
            wordLemma = self.__wml.lemmatize(wordNorm)
            wordCandidates = (wordNorm, wordLemma) if wordNorm != wordLemma else (wordNorm,)
            wordAbbr = ""
            capitalization = self.__getCapitalization(origWord)
            for word in wordCandidates:
                # Check for language degeneracy in the mapping
                if self.__fullWordKey in self.__conflictD and \
                        word in self.__conflictD[self.__fullWordKey]:
                    allowedLangs = self.__conflictD[self.__fullWordKey][word].keys()
                    possibleLangs = allowedLangs & useLangs
                    if len(possibleLangs) == 1:
                        wordAbbr = self.__conflictD[self.__fullWordKey][word][possibleLangs.pop()]
                        break
                    else:
                        logger.error("Language mapping conflict for term %r (%r)",
                                     word, allowedLangs)
                        return title
                if not wordAbbr and self.__prefixKey in self.__conflictD:
                    # prefix conflicts
                    for prefix in sorted(self.__conflictD[self.__prefixKey].keys()):
                        if word.startswith(prefix):
                            allowedLangs = self.__conflictD[self.__prefixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[self.__prefixKey][word][possibleLangs.pop()]
                            else:
                                logger.error("Language mapping conflict for term %r (%r)",
                                             word, allowedLangs)
                                return title
                if not wordAbbr and self.__suffixKey in self.__conflictD:
                    # suffix conflicts
                    for suffix in sorted(self.__conflictD[self.__suffixKey].keys()):
                        if word.endswith(suffix):
                            allowedLangs = self.__conflictD[self.__suffixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[self.__suffixKey][word][possibleLangs.pop()]
                            else:
                                logger.error("Language mapping conflict for term %r (%r)",
                                             word, allowedLangs)
                                return title
                if not wordAbbr and self.__infixKey in self.__conflictD:
                    # infix conflicts
                    for infix in sorted(self.__conflictD[self.__infixKey].keys()):
                        if infix in word:
                            allowedLangs = self.__conflictD[self.__infixKey][word].keys()
                            possibleLangs = allowedLangs & useLangs
                            if len(possibleLangs) == 1:
                                wordAbbr = self.__conflictD[self.__infixKey][word][possibleLangs.pop()]
                            else:
                                logger.error("Language mapping conflict for term %r (%r)",
                                             word, allowedLangs)
                                return title
                if wordAbbr:
                    break
                # Evaluate the abbreviation mapping for each word type
                if not wordAbbr and self.__fullWordKey in self.__abbrevD and \
                        word in self.__abbrevD[self.__fullWordKey]:
                    wordAbbr = self.__abbrevD[self.__fullWordKey][word]
                    break
                if not wordAbbr and self.__prefixKey in self.__abbrevD:
                    # check prefixes in descending length order
                    for prefix in sorted(self.__abbrevD[self.__prefixKey].keys(),
                                         key=lambda p: (-len(p), p)):
                        if word.startswith(prefix):
                            wordAbbr = self.__abbrevD[self.__prefixKey][prefix]
                            break
                if not wordAbbr and self.__suffixKey in self.__abbrevD:
                    # check suffixes in descending length order
                    for suffix in sorted(self.__abbrevD[self.__suffixKey].keys(),
                                         key=lambda p: (-len(p), p)):
                        if word.endswith(suffix):
                            wordAbbr = self.__abbrevD[self.__suffixKey][suffix]
                            break
                if not wordAbbr and self.__infixKey in self.__abbrevD:
                    # check infixes in descending length order
                    for infix in sorted(self.__abbrevD[self.__infixKey].keys(),
                                        key=lambda p: (-len(p), p)):
                        if infix in word:
                            wordAbbr = self.__abbrevD[self.__infixKey][infix]
                            break
                if wordAbbr:
                    break
            # Apply formatting preferences
            if wordAbbr in ("", self.__noAbbrevPlaceHolder):
                wordAbbr = self.__finalizeOutput(word, capitalization, usePunctuation=False)
            else:
                wordAbbr = self.__finalizeOutput(wordAbbr, capitalization, usePunctuation)
            retWordList.append(wordAbbr)
        return unicodedata.normalize("NFKC", " ".join(retWordList))

    def __getType(self, word):
        """Classify the input word based on internal punctuation."""
        if word.startswith("-"):
            return self.__infixKey if word.endswith("-") else self.__suffixKey
        elif word.endswith("-"):
            return self.__prefixKey
        else:
            return self.__fullWordKey

    def __getCapitalization(self, word):
        """Classify case construction of the input term.

        Args:
            word (str): Input term to be evaluated

        Returns:
            (str): flag indicating case ('upper', 'lower', 'title')
        """
        if word == word.upper():
            return self.__uppercaseFlag
        elif word[0].isupper():
            # guess title case if not all upper
            return self.__titlecaseFlag
        else:
            return self.__lowercaseFlag

    def __normalizeWord(self, word):
        """Strip hyphens and other punctuation, lowercase, normalize NFKD."""
        parts = []
        for part in word.split(" "):
            part = re.sub(r"(^\-|\p{P}+$)", "", part).strip()
            parts.append(unicodedata.normalize("NFKD", part.lower()))
        return " ".join(parts).strip()

    def __normalizeAbbr(self, abbr):
        """Strip hyphens and periods, lowercase, normalize NFKD (if not "n.a.")."""
        if abbr == self.__noAbbrevPlaceHolder:
            return abbr
        parts = []
        for part in abbr.split(" "):
            parts.append(unicodedata.normalize("NFKD",
                                               part.strip("- ").rstrip(".").lower()))
        return " ".join(parts)

    def __finalizeOutput(self, word, capitalization, usePunctuation):
        """Modify the output word according to capitalization and punctuation preferences."""
        parts = []
        for part in word.split(" "):
            if capitalization == self.__uppercaseFlag:
                part = part.upper()
            elif capitalization == self.__titlecaseFlag:
                part = string.capwords(part)
            if usePunctuation:
                part += "."
            parts.append(part)
        return " ".join(parts)

    def __getLtwaTerms(self, dirPath, isoLtwaNamePath):
        logger.info("Processing terms in %r", isoLtwaNamePath)
        titleWordAbbrevD = {}
        conflictD = {}
        multiWordTermL = []
        abbrevD = {"abbrev": titleWordAbbrevD, "conflicts": conflictD,
                   "multi_word_abbrev": multiWordTermL}
        #
        mU = MarshalUtil(workPath=dirPath)
        try:
            tsv = mU.doImport(isoLtwaNamePath, fmt="tdd", rowFormat="list",
                              encoding="utf-16-le")
            logger.debug("Read isoLtwaNamePath %s record count %d",
                         isoLtwaNamePath, len(tsv))
            conflictWords = set()
            for line in tsv:
                try:
                    if len(line) == 3:
                        word, abbr, langs = line
                    else:
                        word, abbr = line
                        langs = ""
                except Exception:
                    logger.error("Format issue for line %r", line)
                    continue
                wType = self.__getType(word)
                word = self.__normalizeWord(word)
                abbr = self.__normalizeAbbr(abbr)
                # Assign word type -
                if wType not in titleWordAbbrevD:
                    titleWordAbbrevD[wType] = {}
                # Detect conflict words
                if word in titleWordAbbrevD[wType]:
                    conflictWords.add((wType, word))
                elif " " in word:
                    multiWordTermL.append(re.escape(word))
                #
                titleWordAbbrevD[wType][word] = abbr
            # Build a dictionary capturing degenerate language-specific mappings
            for wType, word in conflictWords:
                # remove from the main list
                titleWordAbbrevD[wType].pop(word)
            logger.debug("conflict words length %d", len(conflictWords))
            for line in tsv:
                try:
                    if len(line) == 3:
                        word, abbr, langs = line
                    else:
                        word, abbr = line
                        langs = ""
                except Exception:
                    logger.error("Format issue for line %r", line)
                    continue
                wType = self.__getType(word)
                word = self.__normalizeWord(word)
                logger.debug("Word %r wordType %r", word, wType)
                if (wType, word) in conflictWords:
                    abbr = self.__normalizeAbbr(abbr)
                    if wType not in conflictD:
                        conflictD[wType] = {}
                    if word not in conflictD[wType]:
                        conflictD[wType][word] = {}
                    for lang in langs.split(","):
                        conflictD[wType][word][lang.strip()] = abbr
            multiWordTermL = sorted(list(set(multiWordTermL)))
            #
            abbrevD = {"abbrev": titleWordAbbrevD, "conflicts": conflictD,
                       "multi_word_abbrev": multiWordTermL}
            for ky in abbrevD["abbrev"]:
                logger.debug("abbreviation type %r length %r",
                             ky, len(abbrevD["abbrev"][ky]))
            for ky in abbrevD:
                logger.debug("Content type %r length %r", ky, len(abbrevD[ky]))
            #
        except Exception as e:
            logger.exception("Failing reading %s with %s", isoLtwaNamePath, str(e))
        return abbrevD

class SeoKeywords:
    def __init__(self):
        self.connect_db()
        self.stemmer = LancasterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.noun_dict = {}
        self.adj_dict = {}
        # configuration
        self.punctuation = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
        self.filter_occurrence_threshold = 2
        self.sample_rate = 10

    def connect_db(self):
        self.db = MySQLdb.connect(host="localhost", user="******",
                                  passwd="!ac-okl.34.731", db="king")
        self.cursor = self.db.cursor(cursorclass=MySQLdb.cursors.DictCursor)

    def run(self):
        # gather product corpus per category
        category_ids = self.get_category_ids_with_many_products()
        for category_id in category_ids:
            products = self.get_products_by_category_id(category_id)
            pc = 0
            for product in products:
                pc += 1
                if pc % self.sample_rate != 0:
                    continue
                print pc
                self.process_product(product)
            print 'product count: ' + str(pc)
        self.nouns = [k for k, v in self.noun_dict.iteritems()
                      if v > self.filter_occurrence_threshold]
        self.adjectives = [k for k, v in self.adj_dict.iteritems()
                           if v > self.filter_occurrence_threshold]
        print 'nouns'
        print len(self.nouns)
        print self.nouns
        print 'adjectives'
        print len(self.adjectives)
        print self.adjectives

    def process_product(self, product):
        #1
        sentence = self.clean_product_string(product)
        #2
        words = self.tokenize_sentence(sentence)
        #3
        nouns, adjectives = self.extract_nouns_and_adjectives(words)
        for noun in nouns:
            self.noun_dict[noun] = self.noun_dict.get(noun, 0) + 1
        for adjective in adjectives:
            self.adj_dict[adjective] = self.adj_dict.get(adjective, 0) + 1

    # processing the corpus
    def clean_product_string(self, product):
        print 'clean_product_corpus:'
        sentence = product['name']  # + '\n' + product['description']
        # strip lines
        #sentence = " ".join(sentence.splitlines())
        # strip html
        sentence = nltk.clean_html(sentence)
        #sentence = " ".join([str(s) for s in BeautifulSoup(sentence).findAll(text=True)])
        # strip punctuation (translate is fastest)
        sentence = sentence.translate(None, self.punctuation)
        # TODO: filter more words
        return sentence

    def tokenize_sentence(self, sentence):
        print sentence
        print '-->'
        # tokenize, lower case, remove stop words, lemmatize
        words = [self.lemmatizer.lemmatize(w.lower())
                 for w in nltk.word_tokenize(sentence)
                 if w not in stopwords.words('english')]
        #words = [self.stemmer.stem(w) for w in words]
        return words

    def extract_nouns_and_adjectives(self, words):
        tags = nltk.pos_tag(words)
        print tags
        nouns = set([t[0] for t in tags if t[1] in ['NN']])
        adjectives = set([t[0] for t in tags if t[1] in ['JJ']])
        return (nouns, adjectives)

    # this returns the categories with more than 10000 products
    def get_category_ids_with_many_products(self):
        print 'get_category_ids_with_many_products'
        return [120401]  # debug short-circuit; everything below is unreachable
        return [100101, 100402, 109999, 110102, 110106, 110804, 119999, 120203,
                120401, 130102, 130804, 131503, 139999, 150901, 160501, 170301,
                170401, 179999, 190301, 200103]
        query = """SELECT cd.category_id, cd.name, cd.description,
                          count(p2c.product_id) as product_count
                   FROM category c
                   JOIN category_description cd on c.category_id = cd.category_id,
                        product_to_category p2c
                   WHERE c.category_id = p2c.category_id
                   GROUP BY 1
                   HAVING count(p2c.product_id) > 10000"""
        self.cursor.execute(query)
        results = self.cursor.fetchall()
        category_tuples = [(row['category_id'], row['name']) for row in results]
        return category_tuples

    def get_products_by_category_id(self, category_id):
        print 'get_products_by_category_id:' + str(category_id)
        query = """SELECT pd.product_id, pd.name, pd.description
                   FROM product_to_category p2c, product_description pd
                   WHERE p2c.product_id = pd.product_id AND p2c.category_id = %s;"""
        self.cursor.execute(query, category_id)
        products = self.cursor.fetchall()
        print 'count:' + str(len(products))
        return products

    def end(self):
        self.db.close()

label_id = len(labels_index)
labels_index[name] = label_id
for fname in sorted(os.listdir(path)):
    fpath = os.path.join(path, fname)
    if sys.version_info < (3,):
        f = open(fpath)
    else:
        f = open(fpath, encoding='latin-1')
    t = f.read()
    t = t.replace('\n', '')
    t = re.sub(r'[^\w\s]', '', t)
    tokens = word_tokenize(t)
    #filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    lmtzr = WordNetLemmatizer()
    lems = [lmtzr.lemmatize(t) for t in tokens]
    t = " ".join(lems)
    texts.append(t)
    f.close()
    labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
with open("word_index.pkl", "wb") as w_index_file:
    pickle.dump(word_index, w_index_file)

def Lemmatization(word):
    # lemmatize() is an instance method, so a WordNetLemmatizer must be
    # instantiated; the original called it on the class itself
    return WordNetLemmatizer().lemmatize(word, "v")

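# --- usage sketch: lemmatizes as a verb ---
print(Lemmatization("studies"))  # 'study'
print(Lemmatization("gone"))     # 'go'
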
def createKeyWords(log_directory, channel_name, output_directory, startingDate,
                   startingMonth, endingDate, endingMonth):
    """ Outputs the keywords for each user on a particular channel after
    normalising the frequency and removing the common stop words.

    Args:
        log_directory (str): location of the logs (assumed to be arranged in a
            directory structure as: <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): channel to perform the analysis on
        output_directory (str): location of the output directory
        startingDate (int): date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): month to start the analysis (in conjunction with startingDate)
        endingDate (int): date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): month to end the analysis (in conjunction with endingDate)

    Returns:
        null
    """
    out_dir_nick_change = output_directory + "key-words/"
    user_words_dict = []
    user_keyword_freq_dict = []
    # list of lists, with each inner list holding all the nicks for one particular person
    nick_same_list = [[] for i in range(5000)]
    keywords_filtered = []
    no_messages = 0

    # print "Creating a new output folder"
    # os.system("rm -rf " + out_dir_nick_change)
    # os.system("mkdir " + out_dir_nick_change)

    # remembers the time of the last message of the file parsed before the current file
    rem_time = None

    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(
                startingDate if folderiterator == startingMonth else 1,
                endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(folderiterator) + "/" + \
                temp2 + str(fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                if not ((folderiterator == 2 and fileiterator in (29, 30, 31)) or
                        (folderiterator in (4, 6, 9, 11) and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue
            with open(filePath) as f:
                content = f.readlines()  # content stores all the lines of the file channel_name

            nicks = []  # list of all the nicknames

            '''
            Getting all the nicknames in a list nicks[]
            '''
            for i in content:
                if i[0] != '=' and "] <" in i and "> " in i:
                    m = re.search(r"\<(.*?)\>", i)
                    if m.group(0) not in nicks:
                        # used regex to get the string between <> and appended it to the nicks list
                        nicks.append(m.group(0))

            for i in xrange(0, len(nicks)):
                nicks[i] = nicks[i][1:-1]  # removed <> from the nicknames

            for i in xrange(0, len(nicks)):
                nicks[i] = ext.util.correctLastCharCR(nicks[i])

            for line in content:
                # excluding the condition when the user changes the topic;
                # search for nick changes only
                if line[0] == '=' and "changed the topic of" not in line:
                    nick1 = ext.util.correctLastCharCR(
                        line[line.find("=") + 1:line.find(" is")][3:])
                    nick2 = ext.util.correctLastCharCR(
                        line[line.find("wn as") + 1:line.find("\n")][5:])
                    if nick1 not in nicks:
                        nicks.append(nick1)
                    if nick2 not in nicks:
                        nicks.append(nick2)

            '''
            Forming list of lists for avoiding nickname duplicacy
            '''
            for line in content:
                if line[0] == '=' and "changed the topic of" not in line:
                    line1 = line[line.find("=") + 1:line.find(" is")][3:]
                    line2 = line[line.find("wn as") + 1:line.find("\n")][5:]
                    line1 = ext.util.correctLastCharCR(line1)
                    line2 = ext.util.correctLastCharCR(line2)
                    for i in range(5000):
                        if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break

            for line in content:
                flag_comma = 0
                if line[0] != '=' and "] <" in line and "> " in line:
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]
                    var = ext.util.correctLastCharCR(var)
                    for d in range(len(nicks)):
                        if var in nick_same_list[d]:
                            nick_sender = nick_same_list[d][0]
                            break
                        else:
                            nick_sender = var
                    nick_receiver = ''
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about :
                        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:len(rec_list[1])]
                        rec_list[1] = rec_list[1][1:]
                        if not rec_list[1]:  # index 0 will contain the time, e.g. 14:02
                            break
                        for k in xrange(0, len(rec_list)):
                            if rec_list[k]:  # checking for \
                                rec_list[k] = ext.util.correctLastCharCR(rec_list[k])
                        for z in rec_list:
                            if z == i:
                                if var != i:
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i
                        if "," in rec_list[1]:  # receiver list may be of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            for y in xrange(0, len(rec_list_2)):
                                if rec_list_2[y]:  # checking for \
                                    rec_list_2[y] = ext.util.correctLastCharCR(rec_list_2[y])
                            for j in rec_list_2:
                                if j == i:
                                    if var != i:
                                        for d in range(len(nicks)):
                                            if i in nick_same_list[d]:
                                                nick_receiver = nick_same_list[d][0]
                                                break
                                            else:
                                                nick_receiver = i
                        if flag_comma == 0:  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = rec[1:]
                            rec = ext.util.correctLastCharCR(rec)
                            if rec == i:
                                if var != i:
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                    # generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = correctNickFor_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()
                    # limit word size to >= 3 characters, drop numbers
                    word_list_temp = re.sub(
                        r'\d+', '',
                        " ".join(re.findall(r'\w{3,}',
                                            ":".join(message).replace(",", " ")))).split(" ")
                    word_list = []
                    # remove punctuation
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words) \
        .union(nicks_for_stop_words).union(stop_word_without_apostrophe) \
        .union(custom_stop_words.words).union(custom_stop_words.slangs)

    count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended,
                                 min_df=1)

    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictonary['sender'],
                'keywords': keywords  # format: [<word>, <frequency>, <normalised_score>]
            })
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = dataForNick(user_keyword_freq_dict, data['nick'],
                                              0.01, 100)
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    print str(startingMonth) + "\t" + str(no_messages) + "\t" + str(len(user_words_dict))
    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from textblob import TextBlob

data_df['tokenized_tweet'] = data_df['clean_text'].apply(lambda x: word_tokenize(x))
stop_words = set(stopwords.words('english'))
data_df['tweet_token_filter'] = data_df['tokenized_tweet'].apply(
    lambda x: [word for word in x if word not in stop_words])

lemmatizing = WordNetLemmatizer()
data_df['tweet_lemmatized'] = data_df['tweet_token_filter'].apply(
    lambda x: ' '.join([lemmatizing.lemmatize(i) for i in x]))

data_df['sentiment_lemmatized'] = data_df['tweet_lemmatized'].apply(
    lambda x: TextBlob(x).sentiment)
data_df[['sentiment_lemmatized', 'tweet_lemmatized']].head(10)

all_words = ' '.join([text for text in data_df['tweet_lemmatized']])
wordcloud = WordCloud(width=800, height=500, random_state=21,
                      max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title("Most Common words in column Tweet Lemmatized")
plt.show()

def keywords(log_dict, nicks, nick_same_list):
    """ Returns keywords for all users

    Args:
        log_dict (str): dictionary of logs data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list: list of same_nick names created using nickTracker.py

    Returns:
        keywords_filtered: filtered keywords per user
        user_keyword_freq_dict: dictionary for each user with keywords and their frequency
        user_words_dict: keywords per user
        nicks_for_stop_words: stop words
    """
    user_words_dict = []
    user_keyword_freq_dict = []
    keywords_filtered = []
    no_messages = 0

    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name,
                          nicks, nick_same_list):
        if rec == nick_name:
            if nick_to_compare != nick_name:
                nick_receiver = util.get_nick_representative(nicks, nick_same_list,
                                                             nick_name)
        return nick_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ''
                    nick_sender = util.get_nick_representative(nicks, nick_same_list,
                                                               nick_to_compare)
                    nick_receiver = ''
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about :
                        rec_list = util.rec_list_splice(rec_list)
                        if not rec_list[1]:  # index 0 will contain the time, e.g. 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)
                        if "," in rec_list[1]:  # receiver list may be of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(
                                    nick_receiver, rec, nick_to_compare, nick_name,
                                    nicks, nick_same_list)
                        if flag_comma == 0:  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)

                    # generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()
                    # limit word size to >= 3 characters, drop numbers
                    word_list_temp = re.sub(
                        r'\d+', '',
                        " ".join(re.findall(r'\w{3,}',
                                            ":".join(message).replace(",", " ")))).split(" ")
                    word_list = []
                    # remove punctuation
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = extended_stop_words(nicks_for_stop_words,
                                              stop_word_without_apostrophe)
    count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended,
                                 min_df=1)

    keywords_for_channels = []
    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictonary['sender'],
                'keywords': keywords
            })
            keywords_for_channels.extend(keywords)
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(
            user_keyword_freq_dict, data['nick'],
            config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER and config.PRINT_WORDS:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, \
        nicks_for_stop_words, \
        sorted(keywords_for_channels, key=lambda x: x[2], reverse=True)

pt = PorterStemmer()
from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

## let's examine the word ``better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')
wn.lemmatize('families', 'n')
##
## applying the porter stemmer to the gettysburg address
text_5 = list(map(pt.stem, text_3))
## now creating a dictionary that will count the occurrence of the words
getty = {}
used = []
for word in text_5:
    if word in getty:
        getty[word] += 1

filtered_sent = []
for w in tokenized_sent:
    # the original tested `w not in tokenized_sent`, which is always false and
    # filters nothing; stop_words (defined earlier) is the intended filter
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Sentence:", tokenized_sent)
print("Filtered Sentence:", filtered_sent)

ps = PorterStemmer()
stemmed_words = []
for w in filtered_sent:
    stemmed_words.append(ps.stem(w))
print("Filtered Sentence:", filtered_sent)
print("Stemmed Sentence:", stemmed_words)

lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "flying"
print("Lemmatized Word:", lem.lemmatize(word, "v"))
print("Stemmed Word:", stem.stem(word))

text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""
print_text_sentence_tokenization(text)

sent = "Albert Einstein was born in Ulm, Germany in 1879."
tokens = nltk.word_tokenize(sent)
print(tokens)
print(nltk.pos_tag(tokens))

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

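# --- usage sketch (not from the original source): a callable class like this
# is typically plugged into a scikit-learn vectorizer via `tokenizer` ---
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(tokenizer=LemmaTokenizer())
X = vect.fit_transform(["The cats sat on the mats", "A cat sits on a mat"])
print(vect.get_feature_names_out())  # lemmatized vocabulary, e.g. 'cat', 'mat', ...
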
if __name__ == '__main__':
    driver = QABase()

    # Get the first question and its story
    q = driver.get_question("fables-01-1")
    story = driver.get_story(q["sid"])

    # get the dependency graph of the first question
    qgraph = q["dep"]
    #print("qgraph:", qgraph)

    # The answer is in the second sentence.
    # You would have to figure this out like in the chunking demo.
    sgraph = story["sch_dep"][1]  # TODO: send in the correct sentence!

    lmtzr = WordNetLemmatizer()
    for node in sgraph.nodes.values():
        tag = node["tag"]
        word = node["word"]
        if word is not None:
            if tag.startswith("V"):
                print(lmtzr.lemmatize(word, 'v'))
            else:
                print(lmtzr.lemmatize(word, 'n'))

    print()
    answer = find_answer(qgraph, sgraph)
    print("answer:", answer)

def textProcessing(dataset):
    tokens = []
    sentencesTokenize = sent_tokenize(dataset)
    for item in sentencesTokenize:
        tokens.append(word_tokenize(item.lower()))
    sentences = copy.deepcopy(tokens)
    englishStopwords = stopwords.words('english')
    punctuations = list(string.punctuation)
    for i in range(len(tokens)):
        tokens[i] = ' '.join(tokens[i])
        tokens[i] = re.sub(r"i'm", "i am", tokens[i])
        tokens[i] = re.sub(r"n't", "not", tokens[i])
        tokens[i] = re.sub(r"n 't", "not", tokens[i])
        tokens[i] = re.sub(r"n' t", "not", tokens[i])
        tokens[i] = re.sub(r"he's", "he is", tokens[i])
        tokens[i] = re.sub(r"she's", "she is", tokens[i])
        tokens[i] = re.sub(r"that's", "that is", tokens[i])
        tokens[i] = re.sub(r"what's", "what is", tokens[i])
        tokens[i] = re.sub(r"where's", "where is", tokens[i])
        tokens[i] = re.sub(r"\'ll", " will", tokens[i])
        tokens[i] = re.sub(r"\'ve", " have", tokens[i])
        tokens[i] = re.sub(r"\'re", " are", tokens[i])
        tokens[i] = re.sub(r"\'d", " would", tokens[i])
        tokens[i] = re.sub(r"won't", "will not", tokens[i])
        tokens[i] = re.sub(r"can't", "cannot", tokens[i])
        tokens[i] = re.sub(r"don't", "do not", tokens[i])
        tokens[i] = "".join(ch for ch in tokens[i] if ch not in punctuations)
    Tokens = []
    for item in tokens:
        Tokens.append(word_tokenize(item.lower()))
    for i in range(len(Tokens)):
        Tokens[i] = [value for value in Tokens[i] if value not in englishStopwords]
    pos_tags = {
        NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
        VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        ADJ: ['JJ', 'JJR', 'JJS'],
        ADV: ['RB', 'RBR', 'RBS', 'WRB'],
    }
    tagged_words = []
    for token in Tokens:
        tagged_words.append(nltk.pos_tag(token))
    pos_words = []
    for i in range(len(tagged_words)):
        pos_word = []
        for j in range(len(tagged_words[i])):
            flag = False
            for key, value in pos_tags.items():
                if tagged_words[i][j][1] in value:
                    pos_word.append((tagged_words[i][j], key))
                    flag = True
                    break
            if not flag:
                # default any unmapped tag to NOUN
                pos_word.append((tagged_words[i][j], NOUN))
        pos_words.append(pos_word)
    normalized_words = []
    lem = WordNetLemmatizer()
    for i in range(len(pos_words)):
        normalized_words.append([lem.lemmatize(w[0], pos=p) for w, p in pos_words[i]])
    return normalized_words, sentences

# en_stop, the custom stop lists, tokenizer, ubuntu_rss_list, and start_time
# are assumed to be defined earlier in the script.
en_stop = en_stop + customList6 + customList7 + customList8 + customList9 + customList10

p_stemmer = PorterStemmer()  # instantiated but unused below; lemmatization is applied instead
lmtzr = WordNetLemmatizer()
doc_set = ubuntu_rss_list
texts = []

# Generate topics using LDA
for doc in doc_set:
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [t for t in tokens if t not in en_stop]
    lemmatized_tokens = [lmtzr.lemmatize(t) for t in stopped_tokens]  # lemmatized, not stemmed
    texts.append(lemmatized_tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word=dictionary, passes=5)
for topic_id, topic in ldamodel.show_topics():
    print(topic_id, topic)

end_time = time()
time_taken = end_time - start_time
# create lemmatizer
lem = WordNetLemmatizer()
# stemmer: used below but never defined in the original; a PorterStemmer is assumed
stemmer = PorterStemmer()
# stopwords
stop_words = set(stopwords.words('english'))

df = pd.read_csv('textm_wine_reviews.csv', nrows=500, delimiter=',')
label_data = pd.DataFrame(df, columns=['winery', 'province', 'country'])
label = list(map(lambda x: ', '.join(x), label_data.values))

input_search = [' memory of a wine once made by his mother']

list_document = list()
vectorizer = CountVectorizer()
for i in range(0, len(df)):
    tokenized_word = word_tokenize(df.loc[i][2])
    filter_words = [w for w in tokenized_word if w not in stop_words]
    stemmed_word = list(map(lambda x: stemmer.stem(x), filter_words))
    lemmatize_word = list(map(lambda x: lem.lemmatize(x, 'v'), stemmed_word))
    output = ' '.join(lemmatize_word)
    # Refit on each document so transform() scores the query against that
    # document's vocabulary alone.
    X = vectorizer.fit_transform([output])
    print(X.toarray())
    Y = vectorizer.transform(input_search)
    list_document.append([label[i], sum(Y.data)])

pprint.pprint(sorted(list_document, key=lambda x: x[1], reverse=True)[0:7])
end = time.time()
print('exe time Count Vectorizer: ', end - start)
# Original paper: http://web.simmons.edu/~benoit/lis466/PorterStemmingAlgorithm.pdf
porter = PorterStemmer()

# (the original printed `types` before it was assigned; the definition is
# moved above its first use)
types = [
    'bed', 'kiss', 'tied', 'tis', 'universal', 'university', 'experiment',
    'experience', 'past', 'paste', 'alumnus', 'alumni', 'adhere', 'adhesion',
    'create', 'creation'
]
print(types)
print([porter.stem(x) for x in types])
print(porter.stem('city'))

porter_results = [porter.stem(x) for x in types]
print(porter_results)

from nltk.stem.wordnet import WordNetLemmatizer
# See description: https://wordnet.princeton.edu/wordnet/man/morphy.7WN.html
lemm = WordNetLemmatizer()
lemm_results = [lemm.lemmatize(x) for x in types]

print('%15s\t%15s\t%15s' % ('type', 'porter', 'lemmatizer'))
print('\n')
print('\n'.join([
    '%15s\t%15s\t%15s' % (t[0], t[1], t[2])
    for t in zip(types, porter_results, lemm_results)
]))

print(lemm.lemmatize('are'))
print(lemm.lemmatize('is'))
print(lemm.lemmatize('are', 'v'))
print(lemm.lemmatize('is', 'v'))
DATA_DIR = "MachineLearningGroupProject/data/" data = load_files(DATA_DIR, encoding='utf-8', decode_error='replace') labels, counts = np.unique(data.target, return_counts=True) labels_str = np.array(data.target_names)[labels] print(dict(zip(labels_str, counts))) #Tokenise and lemmatise the text data nltk.download('wordnet') lemmatiser = WordNetLemmatizer() tokeniser = CountVectorizer().build_tokenizer() for i in range(0, len(data.data)): temp_str = " " data.data[i] = data.data[i].lower() data.data[i] = tokeniser(data.data[i]) for token in range(0, len(data.data[i])): data.data[i][token] = lemmatiser.lemmatize(data.data[i][token]) data.data[i] = temp_str.join(data.data[i]) X_train, X_test, Y_train, Y_test = train_test_split(data.data, data.target) # print(X_test); vectoriser = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1) x_train_counts = vectoriser.fit_transform(X_train) tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts) x_train_tf = tf_transformer.transform(x_train_counts) model = LinearSVC() model.fit(x_train_tf, Y_train) y_pred = model.predict(vectoriser.transform(X_test)) print(accuracy_score(Y_test, y_pred))
def lemmatize_verb(verbs):
    """Lemmatize every word in `verbs` as a verb (e.g. 'running' -> 'run')."""
    lemmatizer = WordNetLemmatizer()
    # return a list rather than a lazy map object (Python 3)
    return [lemmatizer.lemmatize(x, 'v') for x in verbs]
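# Hedged usage example (not in the original):
print(lemmatize_verb(['running', 'ran', 'eaten']))  # -> ['run', 'run', 'eat']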
# List of valid lemmas included in the current query
# query        : Project Gutenberg Literacy Archive Foundation
# query_lemmas : project gutenberg archive foundation
query_lemmas = []
for word, pos in pos_tag(wp_tokenizer.tokenize(query.lower().strip())):
    # Sanitize the query the same way the documents were sanitized when the
    # index was built: lowercase everything, strip punctuation, drop stop
    # words, and apply the same analysis used at indexing time.
    if (pos in CLOSED_TAGS or       # closed tag set lookup, O(1)
            pattern.search(word) or # word contains a non-letter character
            word in stop_words):    # stop word lookup, O(1)
        continue
    # If the term appears as a verb, lemmatize it as a verb ('v'); otherwise as a noun ('n')
    pos = 'v' if pos.startswith('VB') else 'n'
    if word in inverted_file:
        query_lemmas.append(wnl_lemmatizer.lemmatize(word, pos))  # lemmatization

if len(query_lemmas) < 1:
    print("Querying: No relevant document!")
    continue

# Standard query: after sanitizing the input query, retrieve the inverted list
# of each remaining lemma and union the aggregated results.
standard_query(query_lemmas)

# Phrase query: after sanitizing the input query, run a single-word query for
# every lemma found and add each result to the total list. 'common_documents'
# is the set of documents that contain all the words in the query. Those
# documents are then checked for ordering: for every list in the intermediate
# results, build a list of lists of the positions of each word in the input
# query, then iterate over it with two nested loops; if the words appear in
# the proper order, the document matches the phrase.
phrase_query(query_lemmas)
sys.exit(0)
def _remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text)
    for i in urls:
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

regex_pattern = r"#[\w]*"
print(_remove_regex("remove this #hashtag from analytics vidhya", regex_pattern))

lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "multiplying"
print(lem.lemmatize(word, "v"))  # 'multiply'
print(stem.stem(word))           # 'multipli'

lookup_dict = {'rt': 'Retweet', 'dm': 'direct message',
               'awsm': 'awesome', 'luv': 'love'}

def _lookup_words(input_text):
    words = input_text.split()
    new_words = []
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word)
    new_text = " ".join(new_words)
    return new_text

print(_lookup_words("RT this is a retweeted tweet by Shivam Bansal"))
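# The snippets above pass an explicit POS ('v') to the lemmatizer. A small
# hedged helper (not part of the original code) that maps Penn Treebank tags
# to WordNet POS codes makes that choice automatic:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Default to noun, which is also WordNetLemmatizer's own default.
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

tokens = word_tokenize("The striped bats were hanging on their feet")
print([lem.lemmatize(w, penn_to_wordnet(t)) for w, t in pos_tag(tokens)])
# prints something like ['The', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot']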
text = text.lower()
# Remove tags
text = re.sub("</?.*?>", " <> ", text)
# Remove special characters and digits
text = re.sub("(\\d|\\W)+", " ", text)
# Convert string to a list of words
text = text.split()

# Stemming (instantiated but unused below; lemmatization is applied instead)
ps = PorterStemmer()
# Lemmatization
lem = WordNetLemmatizer()
text = [lem.lemmatize(word) for word in text if word not in stop_words]
text = " ".join(text)
corpus.append(text)

# Word cloud
wordcloud = WordCloud(background_color='white',
                      stopwords=stop_words,
                      max_words=100,
                      max_font_size=50,
                      random_state=42).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word2.jpg", dpi=900)
def lemmatize(self, tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
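# Design note (hedged): constructing WordNetLemmatizer per call is cheap since
# the corpus loads lazily, but hoisting it into the owning class's __init__
# avoids the repeated setup, e.g.:
#
#     def __init__(self):
#         self.lemmatizer = WordNetLemmatizer()
#
#     def lemmatize(self, tokens):
#         return [self.lemmatizer.lemmatize(t) for t in tokens]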
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
from collections import defaultdict
import pdb

stopWords = set(stopwords.words("english"))
lmtzr = WordNetLemmatizer()

en_vocab_with_senses = {}
en_word_senses = defaultdict(list)  # keep all senses of a word in a dictionary

with open("../created_datas/en.vocab") as en_vocab_file:
    for line in en_vocab_file:
        word = lmtzr.lemmatize(line.strip().lower())
        senses = wordnet.synsets(line.strip())
        if senses:
            for sense in senses:
                en_word_senses[word].append(sense.name())
                en_vocab_with_senses[sense.name()] = []

with open("../created_datas/en_vocab_with_senses.pkl", "wb") as fw:
    pickle.dump(en_vocab_with_senses, fw)
with open("../created_datas/en_words_wsynsets.pkl", "wb") as ws:
    pickle.dump(en_word_senses, ws)
# Number removal (regex=True makes the pattern explicit; newer pandas warns on
# implicit regex in str.replace)
spotify_file['Review'] = spotify_file['Review'].str.replace(r'\d+', '', regex=True)
spotify_file['Review'].head()

# Lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
# Two passes: first lemmatize each word as a noun, then as a verb
spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word, 'n') for word in x.split()]))
spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))
print(spotify_file['Review'])

# Spelling correction (`spell` is the older autocorrect API; newer versions
# expose a `Speller` class instead)
from autocorrect import spell
spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: " ".join([spell(i) for i in x.split()]))

# Replace words (depends on how the word changes). Note that plain
# str.replace matches substrings, so 'app' also rewrites inside longer words.
spotify_file.Review = spotify_file.Review.str.replace('app', 'application')
spotify_file.Review = spotify_file.Review.str.replace('specify', 'spotify')

# Tokenization
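# A hedged alternative for the substring issue noted above: anchor the
# replacement at word boundaries so words like 'apple' or 'happy' are left
# untouched (regex=True is required for pandas to treat the pattern as a
# regular expression):
spotify_file.Review = spotify_file.Review.str.replace(r'\bapp\b', 'application', regex=True)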
# f, obj, stop_words, lem, keywords, kw, and freq are assumed to be defined earlier
f.write(obj)
word_tokens = list(word_tokenize(obj))

# filtered_sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

t = [lem.lemmatize(i.lower(), pos='v') for i in filtered_sentence]
# t = filtered_sentence
# print(t)

filtered_sentence = []
for w in t:
    if w in keywords:
        filtered_sentence.append(w)
# print("Extracted keywords-->" + str(filtered_sentence))
# print('*' * 30)

for w in filtered_sentence:
    if w in kw[0]:
        freq[0] += 1
    if w in kw[1]:
        freq[1] += 1
# import nltk
# path = "/media/mynewdrive/new_txt"
# dirList = os.listdir(path)

path = "/media/mynewdrive/pos_story.txt"
f = open(path, "r")
a = WordNetLemmatizer()
# Each line holds "<word> <coarse POS tag>"; lemmatize with the matching WordNet POS
for line in f:
    nline = line.rstrip()
    sep = nline.partition(" ")
    if sep[2] == "VERB":
        print(a.lemmatize(sep[0], 'v') + " " + sep[2])
    elif sep[2] == "NOUN":
        print(a.lemmatize(sep[0]) + " " + sep[2])
    elif sep[2] == "ADJ":
        print(a.lemmatize(sep[0], 'a') + " " + sep[2])
    elif sep[2] == "ADV":
        print(a.lemmatize(sep[0], 'r') + " " + sep[2])
f.close()
""" rdfGraph = Graph() for triple in triples: sentence = str(triple).split(",")[1].split("=")[1][1:-1] + " " + str( str(triple).split(",")[2].split("=")[1])[1:-1] + " " + str( str(triple).split(",")[3].split("=")[1])[1:-2] s = str(triple).split(",")[1].split("=")[1][1:-1].lower() p = str(str(triple).split(",")[2].split("=")[1])[1:-1].lower() o = str(str(triple).split(",")[3].split("=")[1])[1:-2].lower() """ Post-processing the triples """ modified_p = lemmatizer.lemmatize(p, 'v') + " " + str(o.split(" ")[0]) if (modified_p in white_dict.keys() or modified_p in type_dict.keys()): p = modified_p o = " ".join(o.split(" ")[1:]) """ subject checking """ nps = [] doc = nlp(s) for np in doc.noun_chunks: nps.append(np.text) ''' Substituting the DBPedia Spotlight Resource URLS ''' subjects = [] if (len(nps) > 0):