示例#1
0
 def translate_song(self, song_dict):
     """
     Build the outgoing song record from a raw song dictionary.

     arguments:
       song_dict is a mapping that must contain 'file' and may contain
       'title', 'artist' and 'album'
     returns:
       dictionary with 'SongID' set to song_dict['file'], plus
       'SongName', 'ArtistName' and 'AlbumName' (each passed through
       util.asciify) for whichever of the optional tags are present
     raises:
       KeyError if song_dict has no 'file' entry
     """
     result = dict(SongID=song_dict['file'])
     optional_tags = (
         ('SongName', 'title'),
         ('ArtistName', 'artist'),
         ('AlbumName', 'album'),
     )
     for key, tag in optional_tags:
         # Only the lookup is guarded: a missing tag is expected (probably
         # a .wav), but any other failure should surface instead of being
         # silently swallowed along with KeyboardInterrupt/SystemExit.
         try:
             raw_value = song_dict[tag]
         except KeyError:
             continue
         result[key] = util.asciify(raw_value)
     return result
def bigram_feats(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a Counter mapping (word, next_word) tuples to the number of times
      that adjacent pair occurs in the reviews present on md
    """
    c = Counter()
    for rev in util.MovieData.reviewers:
        if not hasattr(md, rev):
            continue
        # getattr agrees with the hasattr guard above; indexing
        # md.__dict__ raises KeyError for attributes that are visible to
        # hasattr but not stored directly on the instance
        text = util.asciify(getattr(md, rev).strip().lower())
        # remove punctuation, split on whitespace, then keep only the
        # tokens accepted by util.non_numeric
        words = [w for w in util.punct_patt.sub("", text).split()
                 if util.non_numeric(w)]
        # pair each remaining word with its successor; pairs are built
        # per review, so a bigram never spans two reviews
        c.update(zip(words, words[1:]))
    return c
def revLens(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      dictionary mapping "<reviewer>_length" to the number of
      whitespace-separated tokens in that reviewer's review (after
      asciifying, lowercasing and removing punctuation), with one entry
      per reviewer attribute present on md
    """
    d = {}
    for rev in util.MovieData.reviewers:
        if not hasattr(md, rev):
            continue
        # getattr agrees with the hasattr guard above; indexing
        # md.__dict__ raises KeyError for attributes that are visible to
        # hasattr but not stored directly on the instance
        text = util.asciify(getattr(md, rev).strip().lower())
        d[rev + "_length"] = len(util.punct_patt.sub("", text).split())
    return d
def unigram_feats(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a Counter mapping each unigram (single token) found in the reviews
      present on md to the number of times it occurs
    """
    c = Counter()
    for rev in util.MovieData.reviewers:
        if not hasattr(md, rev):
            continue
        # getattr agrees with the hasattr guard above; indexing
        # md.__dict__ raises KeyError for attributes that are visible to
        # hasattr but not stored directly on the instance
        text = util.asciify(getattr(md, rev).strip().lower())
        # count occurrences of asciified, lowercase unigrams accepted by
        # util.non_numeric, after removing punctuation
        c.update(token for token in util.punct_patt.sub("", text).split()
                 if util.non_numeric(token))
    return c
示例#5
0
def unigram_feats(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a Counter mapping each unigram (single token) found in the reviews
      present on md to the number of times it occurs
    """
    c = Counter()
    for rev in util.MovieData.reviewers:
        if not hasattr(md, rev):
            continue
        # read the review through getattr so the access matches the
        # hasattr guard; md.__dict__[rev] fails with KeyError when the
        # attribute is not stored directly on the instance
        review = getattr(md, rev).strip().lower()
        cleaned = util.punct_patt.sub("", util.asciify(review))
        # count occurrences of asciified, lowercase unigrams accepted by
        # util.non_numeric, after removing punctuation
        c.update(token for token in cleaned.split()
                 if util.non_numeric(token))
    return c