def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--word2vec', type=str, required=True)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    df = pickle.load(open(args.word2vec, 'rb'))
    words_map = {v: k for k, v in enumerate(df['word'].values)}
    vectors_matrix = np.array([list(x) for x in df['vector'].values])

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'words_map.json'), 'w') as f:
        json.dump(words_map, f)
    with open(os.path.join(args.output_dir, 'vectors_matrix.pkl'), 'wb') as f:
        pickle.dump(vectors_matrix, f)
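# Usage sketch (not part of the original script): the two artifacts written
# above can be loaded together to look up a word's embedding. The file names
# match the outputs above; the 'out/' path and the word 'python' are
# illustrative only.
#
#     words_map = json.load(open('out/words_map.json', 'r'))
#     vectors_matrix = pickle.load(open('out/vectors_matrix.pkl', 'rb'))
#     vector = vectors_matrix[words_map['python']]  # row index for "python"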
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataframes', required=True, type=str,
                        help='The pickle file containing the mapping between '
                             'term ids and the renormalised dataframes.')
    parser.add_argument('--terms', required=True, type=str,
                        help='The JSON file containing the mapping from terms '
                             'to term ids.')
    parser.add_argument('--output_dir', required=True, type=str,
                        help='The output directory.')
    args = parser.parse_args()

    df_dict = pickle.load(open(args.dataframes, 'rb'))
    terms_dict = json.load(open(args.terms, 'r', encoding='utf8'))
    result_dict = {
        x: df_dict[terms_dict[x]].rename(columns={terms_dict[x]: x})
        for x in terms_dict
    }

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'final_df_dict.pkl'), 'wb') as f:
        pickle.dump(result_dict, f)
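# Output structure (sketch, not part of the original script): final_df_dict.pkl
# maps each human-readable term to its renormalised dataframe, with the value
# column renamed from the Google Trends term id back to the term itself, so
# e.g. result_dict['python'] carries a 'python' column rather than an id-named
# one (the term 'python' here is illustrative).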
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--page_files', type=str, required=True)
    parser.add_argument('--keep_all_updates', action='store_true')
    parser.add_argument('--save_originals', action='store_true')
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    list_of_filenames = [os.path.join(args.page_files, x)
                         for x in os.listdir(args.page_files)]
    all_pages = crawl_entire_page(list_of_filenames)
    parsed_ads = parse_all_pages(all_pages, args.keep_all_updates)

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'all_ads.json'), 'w') as f:
        json.dump(parsed_ads, f)
    if args.save_originals:
        with open(os.path.join(args.output_dir, 'jobs_rss_raw.json'), 'w') as f:
            json.dump(all_pages, f)
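# Example invocation (sketch; the script name and input directory are
# hypothetical, the flags are the ones defined above):
#
#     python parse_job_ads.py --page_files data/rss_pages/ \
#         --keep_all_updates --save_originals --output_dir out/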
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, required=True)
    parser.add_argument('--crawl', action='store_true')
    parser.add_argument('--translate', action='store_true')
    args = parser.parse_args()

    # Crawl when --crawl is given, or by default when --translate was not
    # requested.
    if args.crawl or not args.translate:
        jobs_df = get_all_vocations(STARTING_URL, BASE_URL)
        print('crawling jobs completed')
        print(jobs_df.head())
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'skills_fr.pkl'), 'wb') as f:
            pickle.dump(jobs_df, f)

    if args.translate:
        # In translate-only mode, reuse the previously crawled dataframe.
        if not args.crawl:
            jobs_df = pickle.load(
                open(os.path.join(args.output_dir, 'skills_fr.pkl'), 'rb'))
        print('starting translation')
        translated_df = translate_df(jobs_df)
        with open(os.path.join(args.output_dir, 'skills_en.pkl'), 'wb') as f:
            pickle.dump(translated_df, f)
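# Example invocations (sketch; the script name is hypothetical, the flags are
# the ones defined above):
#
#     # crawl the vocation pages and save skills_fr.pkl
#     python crawl_skills.py --crawl --output_dir out/
#
#     # translate a previously crawled skills_fr.pkl into skills_en.pkl
#     python crawl_skills.py --translate --output_dir out/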
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--posts', type=str, required=True)
    parser.add_argument('--num_top', type=int, default=1)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    conf = SparkConf().set("spark.driver.maxResultSize", "10G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '400')
    spark = SparkSession.builder. \
        appName("SO Tag first usage date"). \
        config(conf=conf). \
        getOrCreate()
    sc = spark.sparkContext

    # Keep only rows that have an Id, keyed by post id.
    in_rdd = sc.textFile(args.posts). \
        filter(lambda x: get_field(x, 'Id') is not None). \
        map(lambda x: (int(get_field(x, 'Id')), x))
    # Keep posts that carry both tags and a creation date, then emit one
    # (creation date, tag) pair per tag on the post.
    in_rdd = in_rdd. \
        filter(lambda x: get_field(x[1], 'Tags') is not None
               and get_field(x[1], 'CreationDate') is not None). \
        map(lambda x: (datetime.strptime(
                           get_field(x[1], 'CreationDate').decode('utf-8'),
                           DT_FORMAT),
                       get_tags(get_field(x[1], 'Tags').decode('utf-8')))). \
        flatMap(lambda x: [(x[0], y) for y in x[1]])

    tag_date_df = in_rdd.toDF(['CreationDate', 'Tag'])
    # For each tag, rank its occurrences by creation date and keep the
    # num_top earliest ones.
    window = Window.partitionBy(tag_date_df['Tag']). \
        orderBy(tag_date_df['CreationDate'].asc())
    # tag_first_appearances = tag_date_df.groupBy('Tag').agg({'CreationDate': 'min'})
    tag_first_appearances = tag_date_df. \
        select('*', rank().over(window).alias('rank')). \
        filter(col('rank') <= args.num_top)
    tag_first_appearances_pd = tag_first_appearances.toPandas().drop(columns=['rank'])

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir,
                           'tag_' + str(args.num_top) + '_earliest_appearance.csv'),
              'w') as f:
        tag_first_appearances_pd.to_csv(f)
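# Example submission (sketch; the script name, input path and driver memory
# are illustrative, the flags are the ones defined above). --posts points at
# the Stack Overflow posts dump made available to Spark:
#
#     spark-submit --driver-memory 16G so_tag_first_usage.py \
#         --posts /data/stackoverflow/Posts.xml \
#         --num_top 1 --output_dir out/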
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode', choices=['continue', 'start'], required=True, default='start',
        help='The mode of operation. If you were running this script before and '
             'were interrupted by an error (e.g. rate-limiting), use continue, '
             'otherwise use start.')
    parser.add_argument('--dataframes', required=True, type=str,
                        help='Pickle file containing a term id to dataframe '
                             'dictionary. The dataframes are all normalised '
                             'to have a maximum of 100.')
    parser.add_argument(
        '--time_settings', type=str,
        help='The JSON file containing the start and end times of the '
             'time period you are using.')
    parser.add_argument(
        '--state', type=str,
        help='Only for continue mode, the saved state file to load.')
    parser.add_argument(
        '--proxy', type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time', type=int, default=1,
                        help='Sleep time between subsequent queries, to '
                             'avoid rate-limiting. If you\'re rate-limited, '
                             'set this to 60 (unit is seconds).')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='Output directory for the resulting pickle file.')
    parser.add_argument(
        '--starting_term', type=str,
        help='The term to start from in the renormalisation (which will '
             'be the term that has the maximum value of 100 in the end). '
             'If not provided, the starting term will be random.')
    parser.add_argument(
        '--terms', type=str,
        help='If you want to start from a specific term, this JSON needs to be '
             'provided in order to map the term to its term id in Google Trends.')
    args = parser.parse_args()

    if args.mode == 'continue' and (args.state is None
                                    or args.time_settings is not None
                                    or args.starting_term is not None
                                    or args.terms is not None):
        parser.error(
            'In "continue" mode, you should provide a pickle file containing '
            'the saved state.')
    if args.mode == 'start' and (args.state is not None
                                 or args.time_settings is None):
        parser.error(
            'In "start" mode, you should provide a time settings json.')
    if args.starting_term is not None and args.terms is None:
        parser.error(
            'When you provide the starting term, you need to provide the '
            'dictionary mapping them to their term ids.')

    if args.proxy is not None:
        proxy = {'https': args.proxy}
    else:
        proxy = None
    pytrends_obj = create_pytrends_obj(proxies=proxy, sleep_time=args.sleep_time)

    term_dataframe_dict = pickle.load(open(args.dataframes, 'rb'))
    terms_list = list(term_dataframe_dict.keys())

    if args.mode == 'start':
        settings_dict = json.load(
            open(args.time_settings, 'r', encoding='utf8'))
        starting_term = None
        if args.starting_term is not None:
            starting_term = args.starting_term
            term_to_mid = json.load(open(args.terms, 'r'))
            starting_term = term_to_mid[starting_term]
        conversion_ratio_list = find_all_interterm_conversion_rates_start(
            pytrends_obj, terms_list, settings_dict['time_start'],
            settings_dict['time_end'], starting_term=starting_term)
    else:
        saved_state = pickle.load(open(args.state, 'rb'))
        conversion_ratio_list = find_all_interterm_conversion_rates_continue(
            pytrends_obj, saved_state[0], saved_state[1], saved_state[2],
            saved_state[3], saved_state[4], saved_state[5], saved_state[6],
            saved_state[7], saved_state[8], saved_state[9], saved_state[10])

    if conversion_ratio_list is not None:
        renormalisation_dict = compile_final_renormalisation_ratios(
            conversion_ratio_list, terms_list)
        renormalise_all_tags(term_dataframe_dict, renormalisation_dict)
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'renormalised_df_dict.pkl'),
                  'wb') as f:
            pickle.dump(term_dataframe_dict, f)
        with open(os.path.join(args.output_dir, 'conversion_ratios.pkl'),
                  'wb') as f:
            pickle.dump(conversion_ratio_list, f)
    else:
        print(
            'If you have been rate-limited, increasing the sleep time to 60 '
            'seconds should do the trick!')
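# Example invocations (sketch; the script name and file paths are illustrative
# and the pairing with the other scripts' outputs is an assumption, the flags
# are the ones defined above):
#
#     # fresh run
#     python renormalise_trends.py --mode start \
#         --dataframes out/individual_df_dict.pkl \
#         --time_settings out/dataframe_settings.json \
#         --sleep_time 60 --output_dir out/
#
#     # resume after rate-limiting, from a saved state pickle
#     python renormalise_trends.py --mode continue \
#         --dataframes out/individual_df_dict.pkl \
#         --state out/saved_state.pkl --sleep_time 60 --output_dir out/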
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--term_list', required=True, type=str,
                        help='A txt file containing one term per line.')
    parser.add_argument('--output_dir', required=True, type=str,
                        help='Output directory for the resulting dictionary.')
    parser.add_argument('--use_original_terms', action='store_true',
                        help='Whether to just use the original terms '
                             'and to avoid going through the suggestions. '
                             'Not recommended.')
    parser.add_argument(
        '--choose_first', action='store_true',
        help='If you don\'t feel like going through the entire '
             'list of suggestions for each term, use this '
             'option to always select the first one. '
             'Not recommended.')
    parser.add_argument(
        '--proxy', type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time', type=int, default=0,
                        help='Sleep time between subsequent queries, to '
                             'avoid rate-limiting. If you\'re rate-limited, '
                             'set this to 60 (unit is seconds).')
    args = parser.parse_args()

    if args.use_original_terms and args.choose_first:
        parser.error(
            '--use_original_terms and --choose_first are mutually exclusive')

    with open(args.term_list, 'r', encoding='utf8') as f:
        terms = f.readlines()
    terms = [x.strip() for x in terms if len(x.strip()) > 0]

    if args.use_original_terms:
        terms_dict = {x: x for x in terms}
    else:
        if args.proxy is not None:
            proxy = {'https': args.proxy}
        else:
            proxy = None
        pytrends_obj = create_pytrends_obj(proxies=proxy,
                                           sleep_time=args.sleep_time)
        terms_dict = dict()
        for term in terms:
            chosen_term = prompt_term_choice(pytrends_obj, term,
                                             default_choice=args.choose_first)
            if chosen_term is None:
                break
            if chosen_term != '':
                terms_dict[term] = chosen_term

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'term_to_mid.json'),
              mode='w', encoding='utf8') as f:
        json.dump(terms_dict, f)
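# Output sketch (not part of the original script): term_to_mid.json maps each
# input term to the Google Trends term id chosen from the suggestions; the
# terms and id values shown here are purely illustrative.
#
#     {"python": "/m/example_id_1", "machine learning": "/m/example_id_2"}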
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode', choices=['continue', 'start'], required=True, default='start',
        help='The mode of operation. If you were running this script before and '
             'were interrupted by an error (e.g. rate-limiting), use continue, '
             'otherwise use start.')
    parser.add_argument('--time_start', type=str,
                        help='Starting point of the time period you want.')
    parser.add_argument('--time_end', type=str,
                        help='Ending point of the time period you want.')
    parser.add_argument(
        '--terms', type=str,
        help='The JSON file containing the mapping from terms to term ids.')
    parser.add_argument(
        '--state', type=str,
        help='Only for continue mode, the saved state file to load.')
    parser.add_argument(
        '--proxy', type=str,
        help='Proxy server address if you need to use one. Needs to be HTTPS.')
    parser.add_argument('--sleep_time', type=int, default=1,
                        help='Sleep time between subsequent queries, to '
                             'avoid rate-limiting. If you\'re rate-limited, '
                             'set this to 60 (unit is seconds).')
    parser.add_argument('--leap_size', type=int, default=1)
    parser.add_argument('--output_dir', type=str, required=True,
                        help='Output directory for the resulting pickle file.')
    args = parser.parse_args()

    if args.mode == 'continue' and (args.terms is not None
                                    or args.state is None):
        parser.error(
            'In "continue" mode, you should provide a pickle file containing '
            'the saved state.')
    if args.mode == 'start' and (args.terms is None or args.state is not None
                                 or args.time_start is None
                                 or args.time_end is None):
        parser.error(
            'In "start" mode, you should provide a json file mapping terms to '
            'their term ids ("mid"s), in addition to the start and end times.')

    if args.proxy is not None:
        proxy = {'https': args.proxy}
    else:
        proxy = None
    pytrends_obj = create_pytrends_obj(proxies=proxy, sleep_time=args.sleep_time)

    if args.mode == 'start':
        terms_dict = json.load(open(args.terms, 'r', encoding='utf8'))
        terms_list = list(terms_dict.values())
        # Save the time settings next to the output so later stages can reuse
        # them; the output directory has to exist before writing to it.
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'dataframe_settings.json'),
                  'w', encoding='utf8') as f:
            json.dump({'time_start': args.time_start,
                       'time_end': args.time_end}, f)
        df_dict = retrieve_all_terms_start(pytrends_obj, terms_list,
                                           args.time_start, args.time_end,
                                           leap_size=args.leap_size)
    else:
        saved_state = pickle.load(open(args.state, 'rb'))
        df_dict = retrieve_all_terms_continue(pytrends_obj, saved_state[0],
                                              saved_state[1], saved_state[2],
                                              saved_state[3], saved_state[4],
                                              leap_size=saved_state[5])

    if df_dict is not None:
        make_sure_path_exists(args.output_dir)
        with open(os.path.join(args.output_dir, 'individual_df_dict.pkl'),
                  'wb') as f:
            pickle.dump(df_dict, f)
    else:
        print(
            'If you have been rate-limited, increasing the sleep time to 60 '
            'seconds should do the trick!')
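# Example invocation (sketch; the script name, paths and date format are
# illustrative, the flags are the ones defined above):
#
#     python retrieve_trends.py --mode start \
#         --terms out/term_to_mid.json \
#         --time_start 2012-01-01 --time_end 2018-01-01 \
#         --sleep_time 60 --output_dir out/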