Example #1
def compress_twt_v2(self):
    global Pos_tag
    global Cont
    self.compressed_twt = []
    contraction_keys = Cont.keys()
    for twt in self.reduced_twt:
        word = twt["word"]
        time = twt["time"]
        if word in contraction_keys:
            # split the expansion into individual words, each keeping
            # the original timestamp
            new_words = Cont[word][0].split(" ")
            for new_word in new_words:
                self.compressed_twt.append({
                    "word": new_word,
                    "original_word": word,
                    "time": time
                })
        else:
            tag = Pos_tag(
                [word])[0][1]  #-> Pos_tag(["geese"]) -> [("geese", "NN")]
            pos_type = "n"  # default: lemmatize as a noun
            if tag.find("VB") != -1:  # it's a verb
                pos_type = "v"
            new_word = self.Lemma.lemmatize(word, pos=pos_type)
            self.compressed_twt.append({
                "word": new_word,
                "original_word": word,
                "time": time
            })
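
The globals this method reads are defined elsewhere; a minimal setup sketch, assuming NLTK (which the Pos_tag(["geese"]) comment suggests) and a hypothetical stand-in contraction table:

# Sketch of the environment compress_twt_v2 expects; Cont and the sample
# values are hypothetical stand-ins, not the original data.
from nltk import pos_tag as Pos_tag        # needs: nltk.download("averaged_perceptron_tagger")
from nltk.stem import WordNetLemmatizer    # needs: nltk.download("wordnet")

Cont = {"don't": ["do not"]}               # contraction -> list of expansions
Lemma = WordNetLemmatizer()

print(Pos_tag(["geese"]))                  # something like [('geese', 'NNS')]
print(Lemma.lemmatize("geese", pos="n"))   # 'goose'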
Example #2
import re


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    # `contractions` is assumed to be a module-level dict mapping each
    # contraction to its expansion, e.g. {"don't": "do not"}
    contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))

    def replace(match):
        return contractions[match.group(0)]

    return contractions_re.sub(replace, text)
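
A quick usage check, assuming a module-level `contractions` dict; the sample table below is a hypothetical stand-in:

contractions = {"don't": "do not", "won't": "will not"}  # hypothetical sample
print(expand_contractions("they don't know what they won't miss"))
# -> "they do not know what they will not miss"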
Example #3
def expand_contractions(tweet):
    import re
    from contractions import contractions
    tweet = tweet.lower()
    #convert U+2019 to U+0027 (apostrophe)
    tweet = tweet.replace(u"\u2019", u"\u0027")
    contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))

    def replace(match):
        # expand to the most likely alternative, stored first in the list: [0]
        return contractions[match.group(0)][0]

    return contractions_re.sub(replace, tweet)
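
The U+2019 replacement matters because curly "smart quote" apostrophes would otherwise never match the ASCII-keyed contraction table; a standalone check of that step:

s = u"it\u2019s"                        # "it’s" with a right single quotation mark
print(s.replace(u"\u2019", u"\u0027"))  # -> "it's" (plain ASCII apostrophe)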
Example #4
def eliminate_contraction(self):
    '''
    Expand contractions into full words, e.g. there's -> there is.
    '''
    global Cont
    global Pos_tag

    self.modified_transcript = ""
    reg = re.compile(r"\s+|\.")  # split on whitespace and periods
    keys = Cont.keys()
    for word in reg.split(self.full_transcript):
        if word != "" and word != "\n":
            if word in keys:
                word = Cont[word][0]  # take the most likely expansion
            else:
                tag = Pos_tag([word])[0][1]
                pos_type = "n"
                if tag.startswith("VB"):  # it's a verb
                    pos_type = "v"
                word = self.Lemma.lemmatize(word, pos=pos_type)
            self.modified_transcript += (" " + word)
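
The empty-string guard in the loop exists because consecutive delimiters make re.split emit empty tokens; a standalone check:

import re

reg = re.compile(r"\s+|\.")
print(reg.split("there's a cat. dogs ran"))
# -> ["there's", 'a', 'cat', '', 'dogs', 'ran']  (empty token after "cat")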
Example #5
import json
from collections import Counter


def unfrequent_nouns(f):
    """
    - take a recognition result (raw text) and filter out the top x most
      common words, plus variants of those like plurals (x = 5000 right now)
    - remove short words (<3 chars) and words with "'" in them
    - export those words and their counts as JSON (stdout)
    """

    # extend the frequency list with common inflected variants
    top5000words_with_variants = set(top5000words)

    for w in top5000words:
        if len(w) >= 3:
            top5000words_with_variants |= {w + 's'}
            top5000words_with_variants |= {w + 'ing'}
            top5000words_with_variants |= {w + 'ting'}
            top5000words_with_variants |= {w + 'ed'}
            top5000words_with_variants |= {w + 'ped'}
            top5000words_with_variants |= {w + 'd'}
            top5000words_with_variants |= {w + "'s"}
            top5000words_with_variants |= {w + "'ll"}

    contractions = set(contractions_.keys())
    top5000words_with_variants |= contractions

    hyp = [w.lower() for w in open(f, encoding='utf-8').read().split()]
    # hyp_nouns = (w for w, pos in pos_tag(hyp) if
    #              pos in ['NNP', 'NN', '-NONE-', 'NNS'])
    hyp_lemmatized = (lemmatize(w) for w in hyp)

    # remove short words and words with '
    hyp_lemmatized = (
        w for w in hyp_lemmatized
        if len(w) >= 3 and "'" not in w)

    # keep only words outside the extended frequency list
    hyp_special = (w for w in hyp_lemmatized
                   if w not in top5000words_with_variants)
    bag = Counter(hyp_special)

    json_ = json.dumps(
        {word: count for word, count in bag.most_common()}, indent=2)
    print(json_)
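
A minimal harness sketch; every name below is an assumption mirroring the globals the function reads (the real lists are presumably loaded elsewhere), and "hypothesis.txt" is a placeholder path:

from nltk.stem import WordNetLemmatizer   # needs: nltk.download("wordnet")

top5000words = ["the", "and", "run"]      # hypothetical frequency list
contractions_ = {"don't": "do not"}       # hypothetical contraction table
lemmatize = WordNetLemmatizer().lemmatize

unfrequent_nouns("hypothesis.txt")        # prints {"word": count, ...} as JSON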
Example #6
def compress_twt(self):
    global Pos_tag
    global Cont
    self.compressed_twt = []
    self.uncompressed_twt = []
    compressed_twt = {}
    contraction_keys = Cont.keys()
    for twt in self.reduced_twt:
        word = twt["word"]
        time = twt["time"]
        if word in contraction_keys:
            self.uncompressed_twt.append({
                "word": word,
                "original_word": word,
                "time": time
            })
        else:
            tag = Pos_tag(
                [word])[0][1]  #-> Pos_tag(["geese"]) -> [("geese", "NN")]
            pos_type = "n"  # default: lemmatize as a noun
            if tag.find("VB") != -1:  # it's a verb
                pos_type = "v"
            new_word = self.Lemma.lemmatize(word, pos=pos_type)
            if word not in Stop_words:
                # group every timestamp at which this lemma occurs
                compressed_twt.setdefault(new_word, []).append(time)
            self.uncompressed_twt.append({
                "word": new_word,
                "original_word": word,
                "time": time
            })
    for key, times in compressed_twt.items():
        self.compressed_twt.append({
            "word": key,
            "time": times
        })
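
Stop_words is another global; NLTK's English stop list is a plausible source (an assumption). The two output structures then differ as sketched below:

from nltk.corpus import stopwords          # needs: nltk.download("stopwords")
Stop_words = set(stopwords.words("english"))

# After compress_twt():
#   self.compressed_twt   -> [{"word": "goose", "time": [1.2, 7.5]}, ...]
#                            one entry per lemma, all timestamps grouped
#   self.uncompressed_twt -> [{"word": "goose", "original_word": "geese",
#                              "time": 1.2}, ...]
#                            one entry per token, in original order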
Example #7
def remove_contractions(text, contractions):
    # despite the name, this expands contractions via plain substring
    # replacement, e.g. "don't" -> "do not"
    for word in contractions.keys():
        if word in text:
            text = text.replace(word, contractions[word])
    return text
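
A quick check with a hypothetical table; note that plain str.replace also fires inside longer words, which the regex-based variants above only avoid if their keys are anchored with word boundaries:

contractions = {"he'll": "he will"}        # hypothetical sample
print(remove_contractions("she'll wait", contractions))
# -> "she will wait": "he'll" matched inside "she'll"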