def compress_twt_v2(self):
    """Normalize ``self.reduced_twt`` into ``self.compressed_twt``.

    Each entry of ``self.reduced_twt`` is a dict with ``"word"`` and
    ``"time"`` keys. Contractions (keys of the global ``Cont`` mapping) are
    expanded into their component words; every other word is lemmatized
    (as a verb when its POS tag is a VB* variant, otherwise as a noun).
    Every output entry carries ``"word"``, ``"original_word"`` and
    ``"time"`` keys.
    """
    global Pos_tag
    global Cont
    self.compressed_twt = []
    contraction_keys = Cont.keys()  # loop-invariant; hoisted out of the loop
    for twt in self.reduced_twt:
        word = twt["word"]
        time = twt["time"]
        if word in contraction_keys:
            # e.g. "don't" -> ["do", "not"]; each piece keeps the source time.
            # Cont[word][0] is the most likely expansion (see eliminate_contraction).
            for new_word in Cont[word][0].split(" "):
                self.compressed_twt.append({
                    "word": new_word,
                    "original_word": word,
                    "time": time
                })
        else:
            # Pos_tag(["geese"]) -> [("geese", "NN")]
            tag = Pos_tag([word])[0][1]
            pos_type = "n"  # default: lemmatize as a noun
            if tag.find("VB") != -1:  # any verb tag (VB, VBD, VBG, ...)
                pos_type = "v"
            new_word = self.Lemma.lemmatize(word, pos=pos_type)
            self.compressed_twt.append({
                "word": new_word,
                "original_word": word,  # fixed: was misspelled "orginal_word"
                "time": time
            })
def expand_contractions(text):
    """Expand shortened words, e.g. "don't" to "do not".

    Uses the module-level ``contractions`` mapping (contraction -> expansion).
    Keys are regex-escaped so apostrophes/metacharacters are matched
    literally, and tried longest-first so e.g. "can't've" is not shadowed
    by its prefix "can't".
    """
    # sorted longest-first: Python's alternation picks the first matching
    # alternative, so prefixes must come after the longer forms.
    keys = sorted(contractions.keys(), key=len, reverse=True)
    contractions_re = re.compile('(%s)' % '|'.join(re.escape(k) for k in keys))

    def replace(match):
        return contractions[match.group(0)]

    return contractions_re.sub(replace, text)
def expand_contractions(tweet):
    """Lower-case *tweet* and expand its contractions, e.g. "don't" -> "do not".

    The ``contractions`` mapping (project module) maps each contraction to a
    list of candidate expansions; the first entry is taken as the most likely.
    """
    import re
    from contractions import contractions

    tweet = tweet.lower()
    # normalize the typographic apostrophe U+2019 to ASCII U+0027 so the
    # mapping's keys actually match the tweet text
    tweet = tweet.replace(u"\u2019", u"\u0027")
    # regex-escape the keys (apostrophes/metacharacters matched literally)
    # and try them longest-first so "can't've" beats its prefix "can't"
    keys = sorted(contractions.keys(), key=len, reverse=True)
    contractions_re = re.compile('(%s)' % '|'.join(re.escape(k) for k in keys))

    def replace(match):
        # expand the contraction with the most probable alternative: [0]
        return contractions[match.group(0)][0]

    return contractions_re.sub(replace, tweet)
def eliminate_contraction(self):
    '''
    does: transform contraction to full words
    ex:   there's -> there is

    Splits self.full_transcript on whitespace/dots, expands contractions via
    the global Cont mapping (first alternative), lemmatizes everything else
    (verb vs. noun by POS tag), and accumulates the result in
    self.modified_transcript.
    '''
    global Cont
    global Pos_tag
    self.modified_transcript = ""
    reg = re.compile(r"\s+|\.")  # split on whitespace runs and dots
    keys = Cont.keys()
    for word in reg.split(self.full_transcript):
        if word != "" and word != "\n":
            if word in keys:
                word = Cont[word][0]  # most likely expansion
            else:
                tag = Pos_tag([word])[0][1]
                pos_type = "n"  # default: lemmatize as a noun
                # fixed: was `tag == "VBR"`, which is not a Penn Treebank
                # tag, so the verb branch never fired; match any VB* tag
                # (VB, VBD, VBG, VBN, VBP, VBZ) as compress_twt does.
                if tag.find("VB") != -1:
                    pos_type = "v"
                word = self.Lemma.lemmatize(word, pos=pos_type)
            self.modified_transcript += (" " + word)
def unfrequent_nouns(f):
    """
    - take a recognition result (raw text) and filter out the top x most
      common words (plus variants of those like plurals) (x = 500 right now).
    - remove short words (<3 chars) and words with "'" in them
    - export those words and their counts into json (stdout)
    """
    # common words plus crude inflection variants (plural, -ing, -ed, ...)
    top5000words_with_variants = set(top5000words)
    for w in top5000words:
        if len(w) >= 3:
            for suffix in ('s', 'ing', 'ting', 'ed', 'ped', 'd', "'s", "'ll"):
                top5000words_with_variants.add(w + suffix)
    top5000words_with_variants |= set(contractions_.keys())

    with open(f) as fh:  # close the file deterministically
        hyp = [unicode(w.lower(), 'utf-8') for w in fh.read().split()]
    hyp_lemmatized = (lemmatize(w) for w in hyp)
    # remove short words and words with '
    hyp_lemmatized = \
        (w for w in hyp_lemmatized if len(w) >= 3 and "'" not in w)
    # BUG FIX: filter against the variant-augmented set; the original
    # compared against bare top5000words, so the variants built above were
    # never used and plurals/contractions of common words leaked through.
    hyp_special = \
        (w for w in hyp_lemmatized if w not in top5000words_with_variants)
    bag = Counter(hyp_special)
    json_ = json.dumps(
        {word: count for word, count in bag.most_common()}, indent=2)
    print(json_)
def compress_twt(self):
    """Build two views of ``self.reduced_twt``.

    ``self.uncompressed_twt``: one entry per input word, lemmatized
    (contractions are kept verbatim), each with ``"word"``,
    ``"original_word"`` and ``"time"`` keys.

    ``self.compressed_twt``: non-stop-word lemmas grouped together, each
    entry holding a ``"word"`` and the list of all its occurrence times.
    """
    global Pos_tag
    global Cont
    self.compressed_twt = []
    self.uncompressed_twt = []
    compressed_twt = {}
    contraction_keys = Cont.keys()  # loop-invariant; hoisted out of the loop
    for twt in self.reduced_twt:
        word = twt["word"]
        time = twt["time"]
        if word in contraction_keys:
            # contractions pass through unmodified
            self.uncompressed_twt.append({
                "word": word,
                "original_word": word,
                "time": time
            })
        else:
            # Pos_tag(["geese"]) -> [("geese", "NN")]
            tag = Pos_tag([word])[0][1]
            pos_type = "n"  # default: lemmatize as a noun
            if tag.find("VB") != -1:  # any verb tag (VB, VBD, VBG, ...)
                pos_type = "v"
            new_word = self.Lemma.lemmatize(word, pos=pos_type)
            if word not in Stop_words:
                # group every occurrence time under the lemmatized form
                compressed_twt.setdefault(new_word, []).append(time)
            self.uncompressed_twt.append({
                "word": new_word,
                "original_word": word,  # fixed: was misspelled "orginal_word"
                "time": time
            })
    for key in compressed_twt:
        self.compressed_twt.append({
            "word": key,
            "time": compressed_twt[key]
        })
def remove_contractions(text, contraction):
    """Replace every contraction in *text* with its expansion.

    :param text: input string to expand.
    :param contraction: mapping of contraction -> expansion string.
    :returns: *text* with each mapping key substring-replaced by its value.

    BUG FIX: the original ignored the ``contraction`` parameter entirely and
    read the module-level ``contractions`` mapping instead; it also wrapped
    every operand in a pointless ``"" + x + ""`` concatenation and guarded
    each replace with a redundant ``in`` pre-check (``str.replace`` is
    already a no-op when the substring is absent).
    """
    for word, expansion in contraction.items():
        text = text.replace(word, expansion)
    return text