def wikipedia_revision_pattern(films, revision_dir, output, verbose=False, day_limit=28):
    '''Generates a histogram of the Wikipedia article edit frequency in the
    days prior to films' release dates.

    Parameters:
        films: DataFrame with at least 'title' and 'opening_date' columns.
        revision_dir: directory holding the cached Wikipedia revision data
            read by load_wikipedia_revisions().
        output: path the histogram image is written to.
        verbose: if True, print each film's title as it is processed.
        day_limit: right edge of the histogram, in days before release.
    '''
    days_back = []
    for i in films.index:
        film = films.ix[i]
        if verbose:
            print(film['title'])
        revisions = load_wikipedia_revisions(film, revision_dir)
        for rev in revisions:
            # Days between the revision and the film's opening date.
            days_back.append((film['opening_date'] - rev['timestamp'].date()).days)
    plt.figure(figsize=(5, 2.5))
    ax = plt.axes()
    ax.hist(days_back, bins=day_limit)
    # FIX: iterating get_major_ticks() and touching tick.label relied on
    # Tick.label, which was deprecated (matplotlib 3.1) and later removed
    # (3.8); tick_params is the supported way to size tick labels.
    ax.tick_params(axis='both', labelsize='x-small')
    ax.set_xlim(left=1, right=day_limit)
    plt.savefig(output, facecolor='white')
    plt.close()
def wikipedia_revision_pattern(films, revision_dir, output, verbose=False, day_limit=28):
    '''Plot a histogram of how many days before each film's release its
    Wikipedia article was edited, and save the figure to *output*.'''
    # Gather, across all films, the day offset of every revision relative
    # to that film's opening date.
    day_offsets = []
    for idx in films.index:
        row = films.ix[idx]
        if verbose:
            print(row['title'])
        for revision in load_wikipedia_revisions(row, revision_dir):
            delta = row['opening_date'] - revision['timestamp'].date()
            day_offsets.append(delta.days)
    plt.figure(figsize=(5, 2.5))
    axes = plt.axes()
    axes.hist(day_offsets, bins=day_limit)
    # Shrink the tick labels on both axes.
    for axis in (axes.xaxis, axes.yaxis):
        for tick in axis.get_major_ticks():
            tick.label.set_fontsize('x-small')
    axes.set_xlim(left=1, right=day_limit)
    plt.savefig(output, facecolor='white')
    plt.close()
# Rotten Tomatoes genre label -> indicator-feature column name.
_GENRE_COLUMNS = {
    'Action & Adventure': 'genre_action',
    'Animation': 'genre_animation',
    'Art House & International': 'genre_arthouse',
    'Classics': 'genre_classics',
    'Comedy': 'genre_comedy',
    'Cult Movies': 'genre_cult',
    'Documentary': 'genre_documentary',
    'Drama': 'genre_drama',
    'Horror': 'genre_horror',
    'Kids & Family': 'genre_kids',
    'Musical & Performing Arts': 'genre_musical',
    'Mystery & Suspense': 'genre_mystery',
    'Romance': 'genre_romance',
    'Science Fiction & Fantasy': 'genre_scifi',
    'Special Interest': 'genre_special',
    'Sports & Fitness': 'genre_sports',
    'Television': 'genre_tv',
    'Western': 'genre_western',
}


def generate_features(films, output_dir, add_const=False, verbose=False):
    '''For data in films, calculates all per-film model features.

    Parameters:
        films: DataFrame with title/genre/rating/date/revenue columns.
        output_dir: directory holding the cached Wikipedia revision data
            read by load_wikipedia_revisions().
        add_const: accepted for interface compatibility; unused here.
        verbose: if True, print progress for each film.

    Returns:
        (features, response): features is a DataFrame indexed like films;
        response is opening_gross / opening_theaters.

    Raises:
        Exception: if a film has no 'wiki_title'.
    '''
    response = films['opening_gross'] / films['opening_theaters']
    n = len(films.index)
    # One all-zero column per feature, filled positionally below.
    # 'similar_past_revenue' is reserved and never assigned here.
    scalar_features = [
        'edit_runs_7_28', 'edit_runs_0_7', 'word_imax', 'word_extfile',
        'word_headings', 'avg_size', 'similar_past_revenue',
        'mpaa_g', 'mpaa_pg', 'mpaa_pg13', 'release_friday',
    ]
    features = {}
    for name in scalar_features + list(_GENRE_COLUMNS.values()):
        features[name] = [0] * n
    for (i, film_i) in enumerate(films.index):
        film = films.ix[film_i]
        revisions = load_wikipedia_revisions(film, output_dir)
        if verbose:
            # Parenthesized print works under both Python 2 and 3.
            print('(%d) %s / %d revisions' % (film_i, film['wiki_title'], len(revisions)))
        if film['wiki_title'] is None:
            raise Exception('Error: no wiki_title found for film %s, index %i' % (film['title'], i))
        # Genre indicators, driven by the label->column table above instead
        # of eighteen copy-pasted if statements.
        if not pd.isnull(film['genres']):
            genres = set(film['genres'].split(','))
            for (label, column) in _GENRE_COLUMNS.items():
                if label in genres:
                    features[column][i] = 1
        # only a very few films unrated, so anything not in the above 3
        # buckets gets to be "R or UR"
        if film['mpaa_rating'] == 'G':
            features['mpaa_g'][i] = 1
        elif film['mpaa_rating'] == 'PG':
            features['mpaa_pg'][i] = 1
        elif film['mpaa_rating'] == 'PG-13':
            features['mpaa_pg13'][i] = 1
        # FIX: datetime.weekday() is 0-based with Monday == 0, so Friday
        # is 4; the original tested == 5, which flagged Saturday releases.
        if film['opening_date'].weekday() == 4:
            features['release_friday'][i] = 1
        # Revision-based features: an "edit run" is one string of
        # consecutive edits by the same author; count runs starting in the
        # 0-7 and 8-28 day windows before release.
        prev_editor = None
        edit_runs_0_7 = 0
        edit_runs_7_28 = 0
        for rev in revisions:
            if rev['user'] != prev_editor:
                daydiff = (film['opening_date'] - rev['timestamp'].date()).days
                if daydiff <= 7:
                    edit_runs_0_7 += 1
                elif daydiff <= 28:
                    edit_runs_7_28 += 1
            prev_editor = rev['user']
        features['edit_runs_0_7'][i] = edit_runs_0_7
        features['edit_runs_7_28'][i] = edit_runs_7_28
        # Per-revision markup counts, averaged over all revisions below.
        word_imax = np.array([0] * len(revisions))
        word_extfile = np.array([0] * len(revisions))
        word_headings = np.array([0] * len(revisions))
        sizes = np.array([0] * len(revisions))
        for (j, rev) in enumerate(revisions):
            if '*' in rev:  # '*' carries the revision's article text
                content = rev['*'].lower()
                word_imax[j] = len(re.findall(r'\Wimax', content))
                # FIX: the original pattern r'File:.*|' ended in an
                # unescaped '|' (an empty alternative that matches at every
                # position), and 'File:' could never match the lower-cased
                # content. Escape the pipe and lower-case the literal so
                # this counts wiki file links like [[File:x.jpg|thumb]].
                word_extfile[j] = len(re.findall(r'file:.*\|', content))
                word_headings[j] = len(re.findall(r'==.*==', content))
                sizes[j] = rev['size']
        if len(revisions) > 0:
            features['word_imax'][i] = word_imax.mean()
            features['word_extfile'][i] = word_extfile.mean()
            features['word_headings'][i] = word_headings.mean()
            features['avg_size'][i] = sizes.mean()
    features = pd.DataFrame(features, index=films.index)
    # FIX: the original used chained assignment
    # (features['runtime'][isnull] = 0), which pandas does not guarantee
    # will write through; fillna(0) is the equivalent safe form.
    features['runtime'] = films['runtime'].fillna(0)
    # features['opening_theaters'] = films['opening_theaters']
    features['year'] = films['year']
    return (features, response)
def generate_features(films, output_dir, add_const=False, verbose=False):
    '''For data in films, calculates all per-film model features.

    Returns (features, response): features is a DataFrame indexed like
    films, response is opening_gross / opening_theaters.
    NOTE(review): add_const is accepted but never used in this body.
    '''
    # Response variable: per-theater opening revenue.
    response = films['opening_gross'] / films['opening_theaters']
    n = len(films.index)
    # One all-zero column per feature, filled positionally below.
    features = {
        'edit_runs_7_28': [0] * n,
        'edit_runs_0_7': [0] * n,
        'word_imax': [0] * n,
        'word_extfile': [0] * n,
        'word_headings': [0] * n,
        'avg_size': [0] * n,
        'similar_past_revenue': [0] * n,  # never assigned below; stays 0
        'genre_action': [0] * n,
        'genre_animation': [0] * n,
        'genre_arthouse': [0] * n,
        'genre_classics': [0] * n,
        'genre_comedy': [0] * n,
        'genre_cult': [0] * n,
        'genre_documentary': [0] * n,
        'genre_drama': [0] * n,
        'genre_horror': [0] * n,
        'genre_kids': [0] * n,
        'genre_musical': [0] * n,
        'genre_mystery': [0] * n,
        'genre_romance': [0] * n,
        'genre_scifi': [0] * n,
        'genre_special': [0] * n,
        'genre_sports': [0] * n,
        'genre_tv': [0] * n,
        'genre_western': [0] * n,
        'mpaa_g': [0] * n,
        'mpaa_pg': [0] * n,
        'mpaa_pg13': [0] * n,
        'release_friday': [0] * n,
    }
    for (i, film_i) in enumerate(films.index):
        film = films.ix[film_i]
        revisions = load_wikipedia_revisions(film, output_dir)
        if verbose:
            print '(%d) %s / %d revisions' % (film_i, film['wiki_title'], len(revisions))
        if film['wiki_title'] is None:
            raise Exception('Error: no wiki_title found for film %s, index %i' % (film['title'], i))
        # Genre indicators: one 0/1 column per Rotten Tomatoes genre label.
        if not pd.isnull(film['genres']):
            genres = set(film['genres'].split(','))
            if 'Action & Adventure' in genres:
                features['genre_action'][i] = 1
            if 'Animation' in genres:
                features['genre_animation'][i] = 1
            if 'Art House & International' in genres:
                features['genre_arthouse'][i] = 1
            if 'Classics' in genres:
                features['genre_classics'][i] = 1
            if 'Comedy' in genres:
                features['genre_comedy'][i] = 1
            if 'Cult Movies' in genres:
                features['genre_cult'][i] = 1
            if 'Documentary' in genres:
                features['genre_documentary'][i] = 1
            if 'Drama' in genres:
                features['genre_drama'][i] = 1
            if 'Horror' in genres:
                features['genre_horror'][i] = 1
            if 'Kids & Family' in genres:
                features['genre_kids'][i] = 1
            if 'Musical & Performing Arts' in genres:
                features['genre_musical'][i] = 1
            if 'Mystery & Suspense' in genres:
                features['genre_mystery'][i] = 1
            if 'Romance' in genres:
                features['genre_romance'][i] = 1
            if 'Science Fiction & Fantasy' in genres:
                features['genre_scifi'][i] = 1
            if 'Special Interest' in genres:
                features['genre_special'][i] = 1
            if 'Sports & Fitness' in genres:
                features['genre_sports'][i] = 1
            if 'Television' in genres:
                features['genre_tv'][i] = 1
            if 'Western' in genres:
                features['genre_western'][i] = 1
        # only a very few films unrated, so anything not in the above 3
        # buckets gets to be "R or UR"
        if film['mpaa_rating'] == 'G':
            features['mpaa_g'][i] = 1
        elif film['mpaa_rating'] == 'PG':
            features['mpaa_pg'][i] = 1
        elif film['mpaa_rating'] == 'PG-13':
            features['mpaa_pg13'][i] = 1
        # NOTE(review): weekday() == 5 is Saturday (Monday == 0); a Friday
        # release would be weekday() == 4 -- confirm which was intended.
        if film['opening_date'].weekday() == 5:
            features['release_friday'][i] = 1
        # Revision-based features
        prev_editor = None
        edit_runs_0_7 = 0   # edit run = one string of consecutive edits by
        edit_runs_7_28 = 0  # the same author
        for rev in revisions:
            if rev['user'] != prev_editor:
                # Days between this revision and the film's opening date.
                daydiff = (film['opening_date'] - rev['timestamp'].date()).days
                if daydiff <= 7:
                    edit_runs_0_7 += 1
                elif daydiff <= 28:
                    edit_runs_7_28 += 1
            prev_editor = rev['user']
        features['edit_runs_0_7'][i] = edit_runs_0_7
        features['edit_runs_7_28'][i] = edit_runs_7_28
        # Per-revision markup counts, averaged over all revisions below.
        word_imax = np.array([0] * len(revisions))
        word_extfile = np.array([0] * len(revisions))
        word_headings = np.array([0] * len(revisions))
        sizes = np.array([0] * len(revisions))
        for (j, rev) in enumerate(revisions):
            if '*' in rev:  # '*' carries the revision's article text
                content = rev['*'].lower()
                word_imax[j] = len(re.findall(r'\Wimax', content))
                # NOTE(review): the trailing unescaped '|' makes this
                # pattern match the empty string at every position, and
                # 'File:' can never match the lower-cased content --
                # probably meant r'file:.*\|'; confirm before relying on
                # word_extfile.
                word_extfile[j] = len(re.findall(r'File:.*|', content))
                word_headings[j] = len(re.findall(r'==.*==', content))
                sizes[j] = rev['size']
        if len(revisions) > 0:
            features['word_imax'][i] = word_imax.mean()
            features['word_extfile'][i] = word_extfile.mean()
            features['word_headings'][i] = word_headings.mean()
            features['avg_size'][i] = sizes.mean()
    features = pd.DataFrame(features, index=films.index)
    features['runtime'] = films['runtime']
    # NOTE(review): chained assignment; newer pandas may not write through
    # here -- .fillna(0) would be the safe equivalent.
    features['runtime'][features['runtime'].isnull()] = 0
    # features['opening_theaters'] = films['opening_theaters']
    features['year'] = films['year']
    return (features, response)