예제 #1
0
def wikipedia_revision_pattern(films,
                               revision_dir,
                               output,
                               verbose=False,
                               day_limit=28):
    '''
    Generates a histogram of the Wikipedia article edit frequency in the days
    prior to films' release dates and saves it to a file.

    Parameters:
        films: DataFrame with one row per film; the 'title' and
            'opening_date' columns are read here.
        revision_dir: directory handed to load_wikipedia_revisions() to
            locate each film's revisions.
        output: destination passed to plt.savefig().
        verbose: if True, print each film's title as it is processed.
        day_limit: number of histogram bins and right edge of the x axis.
    '''
    days_back = []
    for i in films.index:
        # .loc is the label-based replacement for the removed DataFrame.ix.
        film = films.loc[i]
        if verbose:
            print(film['title'])
        revisions = load_wikipedia_revisions(film, revision_dir)
        # Whole days between each revision and the film's opening date.
        days_back.extend(
            (film['opening_date'] - rev['timestamp'].date()).days
            for rev in revisions)

    plt.figure(figsize=(5, 2.5))
    ax = plt.axes()
    # NOTE(review): without a `range` argument the day_limit bins are spread
    # over the full span of days_back, not just the 1..day_limit window shown
    # below -- confirm load_wikipedia_revisions() already limits that span.
    ax.hist(days_back, bins=day_limit)
    # Shrink the tick labels on both axes; replaces per-tick access through
    # the deprecated Tick.label attribute.
    ax.tick_params(axis='both', which='major', labelsize='x-small')
    ax.set_xlim(left=1, right=day_limit)
    plt.savefig(output, facecolor='white')
    plt.close()
예제 #2
0
def wikipedia_revision_pattern(films, revision_dir, output, verbose=False,
                               day_limit=28):
    '''
    Generates a histogram of the Wikipedia article edit frequency in the days
    prior to films' release dates and writes the figure to `output`.

    Parameters:
        films: DataFrame with one row per film; the 'title' and
            'opening_date' columns are read here.
        revision_dir: directory handed to load_wikipedia_revisions() to
            locate each film's revisions.
        output: destination passed to plt.savefig().
        verbose: if True, print each film's title as it is processed.
        day_limit: number of histogram bins and right edge of the x axis.
    '''
    days_back = []
    for label in films.index:
        # DataFrame.ix has been removed from pandas; .loc performs the same
        # label-based row lookup.
        film = films.loc[label]
        if verbose:
            print(film['title'])
        opening = film['opening_date']
        for rev in load_wikipedia_revisions(film, revision_dir):
            # Whole days between this revision and the opening date.
            days_back.append((opening - rev['timestamp'].date()).days)

    plt.figure(figsize=(5, 2.5))
    ax = plt.axes()
    # NOTE(review): the bins cover the whole range of days_back while the
    # axis is cropped to 1..day_limit -- confirm the revisions are already
    # limited to that window upstream.
    ax.hist(days_back, bins=day_limit)
    # Tick.label is deprecated; tick_params sets the label size for both axes.
    ax.tick_params(axis='both', which='major', labelsize='x-small')
    ax.set_xlim(left=1, right=day_limit)
    plt.savefig(output, facecolor='white')
    plt.close()
예제 #3
0
def generate_features(films, output_dir, add_const=False, verbose=False):
    '''
    For data in films, calculates all model features (genre, MPAA rating and
    release-day indicators plus Wikipedia revision statistics) together with
    the response variable.

    Parameters:
        films: DataFrame with one row per film. The columns read here are
            'title', 'wiki_title', 'genres', 'mpaa_rating', 'opening_date',
            'opening_gross', 'opening_theaters', 'runtime' and 'year'.
        output_dir: directory handed to load_wikipedia_revisions() to locate
            each film's revisions.
        add_const: accepted for interface compatibility; not used here.
        verbose: if True, print one progress line per film.

    Returns:
        (features, response): features is a DataFrame indexed like films;
        response is films['opening_gross'] / films['opening_theaters'].
        The 'similar_past_revenue' column is allocated but never filled in
        by this function, so it is all zeros.

    Raises:
        Exception: if a film's wiki_title is None.
    '''
    # (label in the comma-separated 'genres' field, indicator column)
    genre_columns = (
        ('Action & Adventure', 'genre_action'),
        ('Animation', 'genre_animation'),
        ('Art House & International', 'genre_arthouse'),
        ('Classics', 'genre_classics'),
        ('Comedy', 'genre_comedy'),
        ('Cult Movies', 'genre_cult'),
        ('Documentary', 'genre_documentary'),
        ('Drama', 'genre_drama'),
        ('Horror', 'genre_horror'),
        ('Kids & Family', 'genre_kids'),
        ('Musical & Performing Arts', 'genre_musical'),
        ('Mystery & Suspense', 'genre_mystery'),
        ('Romance', 'genre_romance'),
        ('Science Fiction & Fantasy', 'genre_scifi'),
        ('Special Interest', 'genre_special'),
        ('Sports & Fitness', 'genre_sports'),
        ('Television', 'genre_tv'),
        ('Western', 'genre_western'),
    )
    mpaa_columns = {'G': 'mpaa_g', 'PG': 'mpaa_pg', 'PG-13': 'mpaa_pg13'}

    # Compiled once; all three are applied to lower-cased revision text.
    imax_re = re.compile(r'\Wimax')
    # 'file:' up to the next pipe. The pattern must be lower case because the
    # text is lower-cased, and the pipe must be escaped: a bare '|' is regex
    # alternation with the empty string and matches at every position.
    extfile_re = re.compile(r'file:[^|\n]*\|')
    headings_re = re.compile(r'==.*==')

    response = films['opening_gross'] / films['opening_theaters']
    n = len(films.index)

    feature_names = (
        ['edit_runs_7_28', 'edit_runs_0_7', 'word_imax', 'word_extfile',
         'word_headings', 'avg_size', 'similar_past_revenue'] +
        [column for (_, column) in genre_columns] +
        ['mpaa_g', 'mpaa_pg', 'mpaa_pg13', 'release_friday'])
    features = dict((name, [0] * n) for name in feature_names)

    for (i, film_i) in enumerate(films.index):

        # .loc is the label-based replacement for the removed DataFrame.ix.
        film = films.loc[film_i]

        # Validate before the title is used to load revisions.
        if film['wiki_title'] is None:
            raise Exception(
                'Error: no wiki_title found for film %s, index %i' %
                (film['title'], i))

        revisions = load_wikipedia_revisions(film, output_dir)
        if verbose:
            print('(%d) %s / %d revisions' % (film_i, film['wiki_title'],
                                              len(revisions)))

        # Genre indicators

        if not pd.isnull(film['genres']):
            genres = set(film['genres'].split(','))
            for (label, column) in genre_columns:
                if label in genres:
                    features[column][i] = 1

        # Only a very few films are unrated, so anything outside the three
        # buckets below is implicitly treated as "R or UR" (all zeros).

        rating_column = mpaa_columns.get(film['mpaa_rating'])
        if rating_column is not None:
            features[rating_column][i] = 1

        # weekday() counts Monday as 0, so Friday is 4 (5 is Saturday).
        if film['opening_date'].weekday() == 4:
            features['release_friday'][i] = 1

        # Revision-based features

        # edit run = one string of consecutive edits by the same author;
        # a run is attributed to the window of its first revision.
        prev_editor = None
        edit_runs_0_7 = 0
        edit_runs_7_28 = 0
        for rev in revisions:
            if rev['user'] != prev_editor:
                daydiff = (film['opening_date'] - rev['timestamp'].date()).days
                if daydiff <= 7:
                    edit_runs_0_7 += 1
                elif daydiff <= 28:
                    edit_runs_7_28 += 1
                prev_editor = rev['user']
        features['edit_runs_0_7'][i] = edit_runs_0_7
        features['edit_runs_7_28'][i] = edit_runs_7_28

        n_revisions = len(revisions)
        word_imax = np.zeros(n_revisions, dtype=int)
        word_extfile = np.zeros(n_revisions, dtype=int)
        word_headings = np.zeros(n_revisions, dtype=int)
        sizes = np.zeros(n_revisions, dtype=int)

        for (j, rev) in enumerate(revisions):
            # '*' holds the revision text when it is present.
            if '*' in rev:
                content = rev['*'].lower()
                word_imax[j] = len(imax_re.findall(content))
                word_extfile[j] = len(extfile_re.findall(content))
                word_headings[j] = len(headings_re.findall(content))
            sizes[j] = rev['size']

        # The mean of an empty array is NaN, so films without revisions keep
        # the default 0 for every per-revision average, avg_size included.
        if n_revisions > 0:
            features['word_imax'][i] = word_imax.mean()
            features['word_extfile'][i] = word_extfile.mean()
            features['word_headings'][i] = word_headings.mean()
            features['avg_size'][i] = sizes.mean()

    features = pd.DataFrame(features, index=films.index)

    # Missing runtimes become 0; fillna avoids chained assignment on a
    # column selection.
    features['runtime'] = films['runtime'].fillna(0)
    features['year'] = films['year']

    return (features, response)
예제 #4
0
def generate_features(films, output_dir, add_const=False, verbose=False):
    '''
    For data in films, calculates all model features (genre, MPAA rating and
    release-day indicators plus Wikipedia revision statistics) together with
    the response variable.

    Parameters:
        films: DataFrame with one row per film. The columns read here are
            'title', 'wiki_title', 'genres', 'mpaa_rating', 'opening_date',
            'opening_gross', 'opening_theaters', 'runtime' and 'year'.
        output_dir: directory handed to load_wikipedia_revisions() to locate
            each film's revisions.
        add_const: accepted for interface compatibility; not used here.
        verbose: if True, print one progress line per film.

    Returns:
        (features, response): features is a DataFrame indexed like films;
        response is films['opening_gross'] / films['opening_theaters'].
        The 'similar_past_revenue' column is allocated but never filled in
        by this function, so it is all zeros.

    Raises:
        Exception: if a film's wiki_title is None.
    '''

    # Maps each label of the comma-separated 'genres' field to its column.
    genre_to_feature = {
        'Action & Adventure': 'genre_action',
        'Animation': 'genre_animation',
        'Art House & International': 'genre_arthouse',
        'Classics': 'genre_classics',
        'Comedy': 'genre_comedy',
        'Cult Movies': 'genre_cult',
        'Documentary': 'genre_documentary',
        'Drama': 'genre_drama',
        'Horror': 'genre_horror',
        'Kids & Family': 'genre_kids',
        'Musical & Performing Arts': 'genre_musical',
        'Mystery & Suspense': 'genre_mystery',
        'Romance': 'genre_romance',
        'Science Fiction & Fantasy': 'genre_scifi',
        'Special Interest': 'genre_special',
        'Sports & Fitness': 'genre_sports',
        'Television': 'genre_tv',
        'Western': 'genre_western',
    }

    # Patterns are applied to lower-cased revision text, so they are written
    # in lower case. The pipe after the file name is escaped: left bare it is
    # regex alternation with the empty string and matches everywhere.
    imax_pattern = re.compile(r'\Wimax')
    extfile_pattern = re.compile(r'file:[^|\n]*\|')
    heading_pattern = re.compile(r'==.*==')

    response = films['opening_gross'] / films['opening_theaters']
    n = len(films.index)
    features = {
        'edit_runs_7_28': [0] * n,
        'edit_runs_0_7': [0] * n,
        'word_imax': [0] * n,
        'word_extfile': [0] * n,
        'word_headings': [0] * n,
        'avg_size': [0] * n,
        'similar_past_revenue': [0] * n,
        'genre_action': [0] * n,
        'genre_animation': [0] * n,
        'genre_arthouse': [0] * n,
        'genre_classics': [0] * n,
        'genre_comedy': [0] * n,
        'genre_cult': [0] * n,
        'genre_documentary': [0] * n,
        'genre_drama': [0] * n,
        'genre_horror': [0] * n,
        'genre_kids': [0] * n,
        'genre_musical': [0] * n,
        'genre_mystery': [0] * n,
        'genre_romance': [0] * n,
        'genre_scifi': [0] * n,
        'genre_special': [0] * n,
        'genre_sports': [0] * n,
        'genre_tv': [0] * n,
        'genre_western': [0] * n,
        'mpaa_g': [0] * n,
        'mpaa_pg': [0] * n,
        'mpaa_pg13': [0] * n,
        'release_friday': [0] * n,
    }

    for (i, film_i) in enumerate(films.index):

        # DataFrame.ix has been removed from pandas; .loc performs the same
        # label-based row lookup.
        film = films.loc[film_i]

        # Check the title before it is used to load revisions.
        if film['wiki_title'] is None:
            raise Exception('Error: no wiki_title found for film %s, index %i'
                            % (film['title'], i))

        revisions = load_wikipedia_revisions(film, output_dir)
        if verbose:
            print('(%d) %s / %d revisions'
                  % (film_i, film['wiki_title'], len(revisions)))

        # Genre indicators

        if not pd.isnull(film['genres']):
            for genre in set(film['genres'].split(',')):
                if genre in genre_to_feature:
                    features[genre_to_feature[genre]][i] = 1

        # Only a very few films are unrated, so anything outside the three
        # buckets below is implicitly treated as "R or UR" (all zeros).

        if film['mpaa_rating'] == 'G':
            features['mpaa_g'][i] = 1
        elif film['mpaa_rating'] == 'PG':
            features['mpaa_pg'][i] = 1
        elif film['mpaa_rating'] == 'PG-13':
            features['mpaa_pg13'][i] = 1

        # weekday() counts Monday as 0, so Friday is 4 (5 is Saturday).
        if film['opening_date'].weekday() == 4:
            features['release_friday'][i] = 1

        # Revision-based features

        prev_editor = None
        edit_runs_0_7 = 0    # edit run = one string of consecutive edits by
        edit_runs_7_28 = 0   # the same author
        for rev in revisions:
            if rev['user'] != prev_editor:
                # A run is attributed to the window of its first revision.
                daydiff = (film['opening_date'] - rev['timestamp'].date()).days
                if daydiff <= 7:
                    edit_runs_0_7 += 1
                elif daydiff <= 28:
                    edit_runs_7_28 += 1
                prev_editor = rev['user']
        features['edit_runs_0_7'][i] = edit_runs_0_7
        features['edit_runs_7_28'][i] = edit_runs_7_28

        if len(revisions) > 0:
            word_imax = np.zeros(len(revisions), dtype=int)
            word_extfile = np.zeros(len(revisions), dtype=int)
            word_headings = np.zeros(len(revisions), dtype=int)
            sizes = np.zeros(len(revisions), dtype=int)

            for (j, rev) in enumerate(revisions):
                # '*' holds the revision text when it is present.
                if '*' in rev:
                    content = rev['*'].lower()
                    word_imax[j] = len(imax_pattern.findall(content))
                    word_extfile[j] = len(extfile_pattern.findall(content))
                    word_headings[j] = len(heading_pattern.findall(content))
                sizes[j] = rev['size']

            features['word_imax'][i] = word_imax.mean()
            features['word_extfile'][i] = word_extfile.mean()
            features['word_headings'][i] = word_headings.mean()
            # Kept inside the guard: the mean of an empty array is NaN, so a
            # film without revisions keeps the default 0 instead.
            features['avg_size'][i] = sizes.mean()

    features = pd.DataFrame(features, index=films.index)

    # Missing runtimes become 0; fillna avoids chained assignment on a
    # column selection.
    features['runtime'] = films['runtime'].fillna(0)
    features['year'] = films['year']

    return (features, response)