def restore_database(self):
    """Restore the MongoDB database from the local backup directory.

    Invokes the external ``mongorestore`` binary (path resolved from the
    MONGORESTORE_PATH environment variable in ``__init__``) against the
    backup directory, creating that directory first if needed.

    Raises:
        subprocess.CalledProcessError: if mongorestore exits non-zero.
    """
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(Args.backup_db_dir(), exist_ok=True)
    # Pass the command as an argument list rather than a hand-quoted shell
    # string: paths with spaces need no quoting and no shell is involved.
    subprocess.check_output(
        [self.__mongorestore_path,
         os.path.abspath(Args.backup_db_dir())])
def backup_database(self):
    """Dump the ``videos`` collection of ``videos_analysis`` to the backup dir.

    Invokes the external ``mongodump`` binary (path resolved from the
    MONGODUMP_PATH environment variable in ``__init__``).

    Raises:
        subprocess.CalledProcessError: if mongodump exits non-zero.
    """
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(Args.backup_db_dir(), exist_ok=True)
    # Argument list instead of a hand-quoted shell string: robust against
    # spaces in paths and avoids any shell-quoting bugs.
    subprocess.check_output([
        self.__mongodump_path,
        '--collection', 'videos',
        '--db', 'videos_analysis',
        '--out', os.path.abspath(Args.backup_db_dir()),
    ])
def get_args():
    """Build the argument parsers and parse the command line.

    Returns:
        A ``(all_args, agent_args)`` pair: the fully parsed standard
        namespace, and the result of ``parse_known_args()`` for the
        agent parser (a ``(namespace, remaining_argv)`` tuple).
    """
    args = Args()
    add_standard_args(args)
    add_agent_args(args)
    return args.all.parse_args(), args.agent.parse_known_args()
def __detailed_analysis_for_all_countries(self):
    """Run the general and detailed analyses over videos from ALL countries.

    Loads every video for the configured country codes into one DataFrame,
    writes the analysis output under a dated ``all_country`` directory, and
    opens that directory in the file explorer on success.

    NOTE(review): 'cls' and os.startfile are Windows-only — confirm the tool
    is Windows-targeted before porting.
    """
    os.system('cls')
    print('Please, wait...')
    data = self.__db.get_videos_by_country_codes(list(
        self.__country_codes))
    data_frame = pd.DataFrame(data)
    # Free the raw query result early; only the DataFrame is needed below.
    del data
    if data_frame.size > 0:
        # Results go to <analysis_res_dir>/all_country/<dd.mm.yy>/
        output_directory = os.path.join(
            Args.analysis_res_dir(),
            f'all_country{os.sep}{time.strftime("%d.%m.%y")}{os.sep}')
        print('>>> General analysis is carried out')
        self.__general_analysis_for_data(data_frame, output_directory)
        print('>>> General report is completed!')
        print('>>> Detailed analysis is carried out')
        self.__detailed_analysis_for_data(data_frame, output_directory)
        print('>>> Detailed analysis is completed!')
        # subprocess.Popen(f'explorer /select, {output_directory}')
        os.startfile(output_directory)
    else:
        print('No data for analysis!')
    del data_frame
def launch(self, hours=23, minutes=30, country_codes_path=Args.country_codes_path()):
    """Scrape trending-video data once per day at hours:minutes, forever.

    Each cycle sleeps until the next scheduled scrape time, fetches video
    data for the configured country codes, saves it to the database and
    writes a CSV snapshot.

    NOTE(review): the default for country_codes_path is evaluated once at
    import time — confirm that is intended.
    NOTE(review): t.sleep(delta_time.seconds) uses only the seconds
    component of the timedelta (days are ignored), so the actual sleep may
    not reach the printed scrap_time — verify the scheduling math.
    """
    scrap_time = datetime.combine(date.today(), time(hours, minutes))
    while True:
        current_time = datetime.today()
        delta_time = (scrap_time - current_time)
        # Advance the schedule by whole days based on the delta's day count.
        scrap_time = scrap_time.fromtimestamp(scrap_time.timestamp() +
                                              abs(delta_time.days) * 86400)
        print(
            f'>>> Next scrap will be {scrap_time.strftime("%Y.%m.%d-%H:%M:%S")}'
        )
        t.sleep(delta_time.seconds)
        new_data = match_category_id_with_category_title(
            self.scraper.get_videos_data_by_country_codes_from_file(
                country_codes_path))
        print(f'>>> New {len(new_data)} data videos received!')
        count = self.db.save_many_videos(new_data)
        print(f'>>> Saved {count} data videos to database!')
        save_videos_data_into_csv(new_data)
def __create_and_save_word_cloud(data,
                                 filename,
                                 output_dir=Args.analysis_res_dir(),
                                 user_stopwords=STOPWORDS,
                                 bg_color='black',
                                 max_words=100,
                                 max_font_size=120):
    """Render *data* as a word-cloud image and save it as *filename*.

    The output directory is created when missing and the matplotlib
    figure is closed after saving.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    word_cloud = WordCloud(stopwords=user_stopwords,
                           background_color=bg_color,
                           max_words=max_words,
                           max_font_size=max_font_size,
                           width=1600,
                           height=800).generate(data)
    plt.figure(figsize=(20, 20))
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.savefig(os.path.join(output_dir, filename),
                facecolor='k',
                bbox_inches='tight')
    plt.close()
def views_likes_dislikes_comments_normal_distribution(
        data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Plot log-scale distributions of views, likes, dislikes and comments.

    Fix: the log-transformed values are kept as local Series instead of
    being written back as four new columns on the caller's DataFrame (the
    original silently mutated ``data``). The series are renamed so the
    axis labels stay identical to the original plots.
    """
    views_log = np.log(data['view_count'] + 1).rename('view_count_log')
    likes_log = np.log(data['likes'] + 1).rename('likes_log')
    dislikes_log = np.log(data['dislikes'] + 1).rename('dislikes_log')
    comments_log = np.log(data['comment_count'] + 1).rename('comment_log')
    plt.figure(figsize=(12, 6))
    plt.subplot(221)
    g1 = sns.distplot(views_log)
    g1.set_title("VIEWS LOG DISTRIBUTION", fontsize=16)
    plt.subplot(224)
    g2 = sns.distplot(likes_log, color='green')
    g2.set_title('LIKES LOG DISTRIBUTION', fontsize=16)
    plt.subplot(223)
    g3 = sns.distplot(dislikes_log, color='r')
    g3.set_title("DISLIKES LOG DISTRIBUTION", fontsize=16)
    plt.subplot(222)
    g4 = sns.distplot(comments_log)
    g4.set_title("COMMENTS LOG DISTRIBUTION", fontsize=16)
    plt.subplots_adjust(wspace=0.2, hspace=0.4, top=0.9)
    __save_figure(plt, output_dir, 'normal_distribution.png')
    plt.close()
def category_rating(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Bar chart of video counts per category, most frequent first."""
    order = data['category'].value_counts().index
    plt.figure(figsize=(30, 9))
    axes = sns.countplot(data['category'], order=order)
    axes.set_title("Counting the Video Category's ", fontsize=20)
    axes.set_xlabel("", fontsize=20)
    axes.set_ylabel("Count", fontsize=20)
    __save_figure(axes.get_figure(), output_dir, 'category_rating.png')
    plt.close()
def __init__(self, uri=Args.db_host()):
    """Connect to MongoDB and prepare the videos collection.

    Args:
        uri: MongoDB connection URI; defaults to the configured db host
            (evaluated once at import time).
    """
    self.__client = MongoClient(uri)
    self.__db = self.__client["videos_analysis"]
    self.__videos_coll = self.__db["videos"]
    # NOTE(review): "county_code" looks like a typo for "country_code" —
    # verify against the documents actually stored before changing it.
    self.__videos_coll.create_index([("county_code", pymongo.DESCENDING)])
    # External tool paths for backup/restore; None when the env vars are unset.
    self.__mongodump_path = os.getenv('MONGODUMP_PATH')
    self.__mongorestore_path = os.getenv('MONGORESTORE_PATH')
def distribution_of_days_histogram(data: DataFrame,
                                   output_dir=Args.analysis_res_dir()):
    """Histogram of the trending-interval values for the given videos."""
    prepared = distribution_of_days_preprocessing(data)
    plt.figure(figsize=(20, 9))
    axes = sns.countplot(prepared['interval'])
    plt.title('Distribution of interval')
    __save_figure(axes.get_figure(), output_dir,
                  'distribution_of_days_histogram.png')
    plt.close()
def __add_country_codes_from_file(self):
    """Prompt the user for a file path and load country codes from it.

    An empty answer falls back to the configured default path; a missing
    file is reported instead of propagating the exception.
    """
    os.system('cls')
    answer = input(
        f'Enter file path (Default="{Args.country_codes_path()}"): ')
    # Empty input means "use the default path".
    file_path = answer or Args.country_codes_path()
    try:
        self.__add_country_codes(get_data_from_file(file_path))
    except FileNotFoundError as e:
        print(f'{e.strerror}: "{e.filename}"')
def word_cloud_for_titles(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Render a word cloud from all video titles, lower-cased and joined."""
    all_titles = data['title'].str.lower().str.cat(sep=' ')
    __create_and_save_word_cloud(data=all_titles,
                                 filename='word_cloud_for_titles.png',
                                 user_stopwords=USER_STOPWORDS,
                                 output_dir=output_dir)
    # Titles can be a large string; release it eagerly.
    del all_titles
def __detailed_analysis_for_each_country_separately(self):
    """Run general + detailed analyses per country code, one at a time.

    Each country's results go to a dated directory named after its code;
    countries with no stored videos are skipped with a message. If at
    least one country was analyzed the results root is opened in the
    file explorer.

    NOTE(review): 'cls' and os.startfile are Windows-only.
    """
    os.system('cls')
    print('Please, wait...')
    is_analyze = False
    for code in self.__country_codes:
        print(f'COUNTRY: {pycountry.countries.get(alpha_2=code).name}')
        data = self.__db.get_videos_by_country_code(code)
        data_frame = pd.DataFrame(data)
        # Free the raw query result early; only the DataFrame is used below.
        del data
        if data_frame.size == 0:
            print(f'No data for analysis {code}!')
            continue
        is_analyze = True
        # Results go to <analysis_res_dir>/<code>/<dd.mm.yy>/
        output_directory = os.path.join(
            Args.analysis_res_dir(),
            f'{code}{os.sep}{time.strftime("%d.%m.%y")}{os.sep}')
        print('>>> General analysis is carried out')
        self.__general_analysis_for_data(data_frame, output_directory)
        print('>>> General report is completed!')
        print('>>> Detailed analysis is carried out')
        self.__detailed_analysis_for_data(data_frame, output_directory)
        print('>>> Detailed analysis is completed!')
        del data_frame
    if is_analyze:
        os.startfile(Args.analysis_res_dir())
def distribution_plot(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Plot the per-category mean of views/likes/dislikes/comments.

    Improvement: the four copy-pasted subplot blocks are collapsed into a
    single data-driven loop; output is pixel-identical to the original.
    """
    general_view = pd.DataFrame(
        data[['view_count', 'likes', 'dislikes',
              'comment_count']].groupby(data['category']).mean())
    # One (column, color, linestyle) entry per subplot, in grid order.
    panels = [
        ('view_count', 'blue', 'solid'),
        ('likes', 'green', 'dotted'),
        ('dislikes', 'black', 'dashed'),
        ('comment_count', 'red', 'dashdot'),
    ]
    plt.figure(figsize=(32, 20))
    for position, (column, color, style) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(general_view.index, column, data=general_view,
                 color=color, linewidth=2, linestyle=style)
        # capitalize() reproduces the original titles exactly,
        # e.g. 'View_count vs Category'.
        plt.title(f'{column.capitalize()} vs Category')
        plt.xticks(rotation=30)
    __save_figure(plt, output_dir, 'distribution_plot.png')
    plt.close()
def distribution_of_average_time(data: DataFrame,
                                 output_dir=Args.analysis_res_dir()):
    """Plot the mean trending interval per category, ascending."""
    prepared = distribution_of_days_preprocessing(data)
    mean_intervals = pd.DataFrame(prepared['interval'].groupby(
        prepared['category']).mean()).sort_values(by="interval")
    plt.figure(figsize=(20, 9))
    plt.plot(mean_intervals, color='skyblue', linewidth=2)
    plt.title("Average Days to be trending video", fontsize=20)
    plt.xlabel('Category', fontsize=16)
    plt.ylabel('Average Time Interval', fontsize=16)
    plt.xticks(rotation=30)
    __save_figure(plt, output_dir, 'distribution_of_average_time.png')
    plt.close()
def save_videos_data_into_csv(
        videos_data: list,
        file_name=None,
        output_dir=Args.raw_data_dir()):
    """Write a list of video dicts to a CSV file.

    Args:
        videos_data: list of dicts sharing the same keys; None entries are
            skipped. An empty list (or all-None) writes nothing.
        file_name: target file name. Defaults to a timestamped name computed
            at CALL time — the original computed it once at import time, so
            every call in a process reused the same name.
        output_dir: directory handed to write_to_file.

    Raises:
        ValueError: if videos_data is None.
    """
    if videos_data is None:
        raise ValueError('Videos data can`t be None!')
    # Drop None entries up front; the header is taken from the first real
    # record (the original indexed videos_data[0] and crashed on [] or
    # a leading None).
    records = [video for video in videos_data if video is not None]
    if not records:
        return
    if file_name is None:
        file_name = f"{time.strftime('%d-%m-%y_%H.%M.%S')}_videos.csv"
    csv_data = [','.join(records[0].keys())]
    for video in records:
        csv_data.append(','.join(
            prepare_feature_for_csv(val) for val in video.values()))
    write_to_file(output_dir, file_name, csv_data)
def match_category_id_with_category_title(videos_data: list,
                                          category_id_file_path=None) -> list:
    """Replace each video's 'category_id' with its human-readable title.

    Args:
        videos_data: list of video dicts, mutated in place. Entries that
            already carry a 'category' key are left untouched; the others
            have 'category_id' popped and 'category' set.
        category_id_file_path: path to the YouTube category JSON file (an
            object with an 'items' list); defaults to
            Args.category_id_file_path().

    Returns:
        The same list object, for chaining.

    Raises:
        ValueError: if videos_data is None.
    """
    if videos_data is None:
        raise ValueError('Videos data can`t be None!')
    if category_id_file_path is None:
        category_id_file_path = Args.category_id_file_path()
    # Explicit encoding: the category file is JSON and must not depend on
    # the platform default encoding.
    with open(category_id_file_path, 'r', encoding='utf-8') as f:
        category = json.load(f)
    categories = {int(item['id']): item['snippet']['title']
                  for item in category['items']}
    for video in videos_data:
        if 'category' not in video:
            category_id = video.pop('category_id')
            # Unknown ids fall back to the raw id value.
            video['category'] = categories.get(int(category_id), category_id)
    return videos_data
def distribution_boxplot(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Log-scale boxplots of views/likes/dislikes/comments per category.

    Improvement: the four copy-pasted subplot blocks are collapsed into a
    loop, and the category ordering (value_counts) is computed once instead
    of four times; output is identical to the original.
    """
    view_count = np.log(data['view_count'] + 1)
    likes = np.log(data['likes'] + 1)
    dislikes = np.log(data['dislikes'] + 1)
    comment = np.log(data['comment_count'] + 1)
    data_count = pd.concat([view_count, likes, dislikes, comment], axis=1)
    data_count.index = data['category']
    # Mask exact zeros (videos with no activity in a metric) to NaN.
    data_count = data_count[(data_count != 0)]
    category_order = data['category'].value_counts().index
    plt.figure(figsize=(32, 20))
    for position, column in enumerate(
            ['view_count', 'likes', 'dislikes', 'comment_count'], start=1):
        plt.subplot(2, 2, position)
        sns.boxplot(data_count.index, column, data=data_count,
                    order=category_order)
        plt.xticks(rotation=30, fontsize=12)
    __save_figure(plt, output_dir, 'distribution_boxplot.png')
    plt.close()
def __init__(self):
    """Cache the YouTube API key resolved by the Args configuration."""
    self.__api_key = Args.api_key()
def add_standard_args(args: Args):
    """Register the standard Deepdrive command-line flags on *args*.

    Fixes over the original:
    - the --randomize-shadow-level and --randomize-month help texts were
      swapped; each flag now describes itself.
    - missing spaces in the --upload-gist and --image-resize-dims help
      strings (which rendered as "performancestats" and "andwanted") are
      restored.
    """
    args.add(
        '-e', '--env-id',
        nargs='?',
        default='Deepdrive-v0',
        help='Select the environment to run')
    args.add(
        '-r', '--record',
        action='store_true',
        default=False,
        help='Records game driving, including recovering from random actions')
    args.add(
        '--discrete-actions',
        action='store_true',
        default=False,
        help='Use discrete, rather than continuous actions')
    args.add(
        '--recording-dir',
        nargs='?',
        default=c.RECORDING_DIR,
        help='Where to store and read recorded environment data from')
    args.add(
        '--render',
        action='store_true',
        default=False,
        help='Show the cameras as seen your agents in Python')
    args.add(
        '--sync',
        action='store_true',
        default=False,
        help='Use synchronous stepping mode where the simulation advances only '
             'when calling step')
    args.add(
        '--sim-step-time',
        type=float,
        default=c.DEFAULT_SIM_STEP_TIME,
        help='Time to pause sim in synchronous stepping mode')
    args.add(
        '--enable-traffic',
        action='store_true',
        default=False,
        help='Enable traffic within the simulator')
    args.add(
        '--randomize-sun-speed',
        action='store_true',
        default=False,
        help='Whether to randomize the virtual speed of the earth\'s orbit '
             'around the sun')
    args.add(
        '--randomize-view-mode',
        action='store_true',
        default=False,
        help='Whether to randomize view mode on episode reset')
    # Fix: help texts below were swapped between these two flags.
    args.add(
        '--randomize-shadow-level',
        action='store_true',
        default=False,
        help='Whether to randomize shadow quality render levels')
    args.add(
        '--randomize-month',
        action='store_true',
        default=False,
        help='Whether to randomize virtual position of Earth around Sun via '
             'month')
    args.add(
        '--path-follower',
        action='store_true',
        default=False,
        help='Whether to let the in-game path follower drive')
    args.add(
        '--eval-only',
        action='store_true',
        default=False,
        help='Whether to just run evaluation, i.e. disable gradient updates',
    )
    args.add(
        '--driving-style',
        nargs='?',
        default=DrivingStyle.NORMAL.as_string(),
        help='Speed vs comfort prioritization, i.e. ' +
             ', '.join([level.name.lower() for level in DrivingStyle]))
    args.add(
        '--remote',
        action='store_true',
        default=False,
        help='Use API to connect to a remote environment')
    args.add(
        '-v', '--verbose',
        help='Increase output verbosity',
        action='store_true')
    args.add(
        '--camera-rigs',
        nargs='?',
        default=None,
        help='Name of camera rigs to use')
    args.add(
        '--experiment',
        nargs='?',
        default=None,
        help='Name of your experiment')
    args.add(
        '--fps',
        type=int,
        default=c.DEFAULT_FPS,
        help='Frames or steps per second')
    args.add(
        '--ego-mph',
        type=float,
        default=25,
        help='Ego (i.e. main) agent vehicle miles per hour')
    args.add(
        '--view-mode-period',
        type=int,
        default=None,
        help='Number of steps between view mode switches')
    args.add(
        '--max-steps',
        type=int,
        default=None,
        help='Max number of steps to run per episode')
    args.add(
        '--max-episodes',
        type=int,
        default=None,
        help='Maximum number of episodes')
    args.add(
        '--server',
        action='store_true',
        default=False,
        help='Run as an API server - serializes in pyarrow',
    )
    args.add(
        '--json-server',
        action='store_true',
        default=False,
        help='Run as a JSON API server - serializes with JSON',
    )
    args.add(
        '--upload-gist',
        action='store_true',
        default=False,
        # Fix: space added between 'performance' and 'stats'.
        help='Upload a private gist with driving performance '
             'stats csv files',
    )
    args.add(
        '--public',
        action='store_true',
        default=False,
        help='Results will be made public, i.e. artifacts like '
             'https://gist.github.com/deepdrive-results/cce0a164498c17269ce2adea2a88ec95',
    )
    args.add(
        '--image-resize-dims',
        nargs='?',
        default=json.dumps(MOBILENET_V2_IMAGE_SHAPE),
        # Fix: space added between 'and' and 'wanted'.
        help='Resize the image coming from the cameras. This was added as '
             'we trained MNET (224x224) on old AlexNet data (227x227), and '
             'wanted to test using the same transformation.')
    args.add(
        '--update-sim',
        action='store_true',
        default=False,
        help='Update sim to the latest version',
    )
    args.add(
        '--scenario',
        type=int,
        default=c.DEFAULT_SCENARIO_INDEX,
        help='Scenario index to run 0-5 are Kevindale scenarios')
    args.add(
        '--map',
        nargs='?',
        default='',
        help='The Unreal Map to load - options: ' +
             ', '.join(c.MAP_LOOKUP.keys()))
def correlation(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Heatmap of pairwise correlations between the four count metrics."""
    metrics = data[['view_count', 'likes', 'dislikes', 'comment_count']]
    heatmap = sns.heatmap(metrics.corr(), cmap='Blues', annot=True)
    __save_figure(heatmap.get_figure(), output_dir, 'correlation.png')
    plt.close()
def sentiment_analysis(data: DataFrame, output_dir=Args.analysis_res_dir()):
    """Plot an aggregate VADER polarity score per video category.

    For each category the tag words are cleaned (punctuation/digits/
    stopwords removed), the most frequent words are scored with VADER, and
    the summed compound score is plotted as a horizontal bar chart.

    Fixes over the original:
    - SentimentIntensityAnalyzer is constructed once, not once per word
      (each construction reloads the VADER lexicon and dominated runtime).
    - the non-letter regex is precompiled and hoisted out of the loop.
    - stopword membership uses a set instead of a list (O(1) lookups).
    - axis labels were swapped (x is polarity, y is category); corrected.
    """
    category_list = data['category'].unique()
    # Collect stopwords for all languages present in the data set.
    all_stopwords = set(stopwords.words('english'))
    for language in ('german', 'french', 'russian'):
        all_stopwords.update(stopwords.words(language))
    analyzer = SentimentIntensityAnalyzer()
    non_letters = re.compile('[^A-Za-z]+')
    MAX_N = 10000
    polarities = list()
    for category in category_list:
        print(f'>> {category}')
        tags_word = data[data['category'] ==
                         category]['tags'].str.lower().str.cat(sep=' ')
        # Removes punctuation and numbers, then tokenizes.
        tags_word = non_letters.sub(' ', tags_word)
        word_tokens = word_tokenize(tags_word)
        # Drop stopwords, 1-2 character tokens and pure digits.
        cleaned_data_title = [
            w for w in word_tokens
            if w not in all_stopwords and len(w) > 2 and not w.isdigit()
        ]
        # Frequency distribution of the surviving words.
        word_dist = nltk.FreqDist(cleaned_data_title)
        word_freq = pd.DataFrame(word_dist.most_common(MAX_N),
                                 columns=['Word', 'Frequency'])
        compound = .0
        for word in word_freq['Word'].head(MAX_N):
            compound += analyzer.polarity_scores(word)['compound']
        polarities.append(compound)
    tags_sentiment = pd.concat(
        [pd.DataFrame(category_list), pd.DataFrame(polarities)], axis=1)
    tags_sentiment.columns = ['category', 'polarity']
    tags_sentiment = tags_sentiment.sort_values('polarity').reset_index()
    plt.figure(figsize=(18, 10))
    sns.barplot(x=tags_sentiment['polarity'],
                y=tags_sentiment['category'],
                data=tags_sentiment)
    plt.xlabel("Polarity", fontsize=20)
    plt.ylabel("Categories", fontsize=20)
    plt.yticks(fontsize=15)
    plt.xticks(fontsize=15)
    plt.title("\nPolarity of Different Categories videos\n", fontsize=25)
    __save_figure(plt, output_dir, 'polarity_of_categories.png')
    plt.close()