def test_creds_default_location_beneath_home(self):
    """With no ``--creds`` flag, parsed ``creds`` equals ``self.default_creds``."""
    argv = 'scrape_user -u test_user --all'.split()
    namespace = parse_scraper_args(self.parser.parse_args(argv), self.parser)
    self.assertEqual(namespace.creds, self.default_creds)
def test_output_dir_default(self):
    """With no output-dir flag, parsed ``output_dir`` is the current working directory."""
    argv = 'scrape_user -u test_user --all'.split()
    namespace = parse_scraper_args(self.parser.parse_args(argv), self.parser)
    self.assertEqual(namespace.output_dir, os.getcwd())
def test_implicit_mode_defaults_to_all_choices(self):
    """Without ``--mode``, ``mode`` is the full choice list for the command.

    User commands are compared against ``USER_MODE_CHOICES`` and publication
    commands against ``PUB_MODE_CHOICES``, for both the ``--all`` form and
    the explicit ``--start``/``--end`` form.
    """
    user_all = 'scrape_user -u test_user --all'.split()
    user_period = 'scrape_user -u test_user --start 2020-01-01 --end 2020-02-01'.split()
    pub_all = 'scrape_publication -u test_pub --all'.split()
    pub_period = 'scrape_publication -u test_pub --start 2020-01-01 --end 2020-02-01'.split()

    # User commands: parse everything first, then post-process, then assert.
    user_namespaces = []
    for argv in (user_all, user_period):
        user_namespaces.append(self.parser.parse_args(argv))
    user_resolved = []
    for namespace in user_namespaces:
        user_resolved.append(parse_scraper_args(namespace, self.parser))
    for resolved in user_resolved:
        self.assertEqual(resolved.mode, USER_MODE_CHOICES)

    # Publication commands follow the same three stages.
    pub_namespaces = []
    for argv in (pub_all, pub_period):
        pub_namespaces.append(self.parser.parse_args(argv))
    pub_resolved = []
    for namespace in pub_namespaces:
        pub_resolved.append(parse_scraper_args(namespace, self.parser))
    for resolved in pub_resolved:
        self.assertEqual(resolved.mode, PUB_MODE_CHOICES)
def test_period_end_defaults_to_most_recent_full_day_utc(self):
    """With only ``--start`` given, ``end`` equals today's date at 00:00 UTC."""
    argv = 'scrape_user -u test_user --start 2020-01-01'.split()

    # Truncate the current UTC time to midnight by keeping only the date fields.
    current = datetime.now(timezone.utc)
    expected_end = datetime(
        current.year, current.month, current.day, tzinfo=timezone.utc
    )

    namespace = parse_scraper_args(self.parser.parse_args(argv), self.parser)
    self.assertEqual(namespace.end, expected_end)
def test_period_start_defaults_beginning_day_prior_to_end(self):
    """With only ``--end`` given, ``start`` is 2020-01-31 00:00 UTC.

    Checked for both a date-only ``--end`` and a datetime ``--end`` on
    2020-02-01.
    """
    date_only = 'scrape_user -u test_user --end 2020-02-01'.split()
    with_time = 'scrape_user -u test_user --end 2020-02-01T12:00:00'.split()
    expected_start = datetime.strptime('2020-01-31', '%Y-%m-%d').replace(
        tzinfo=timezone.utc
    )

    for argv in (date_only, with_time):
        namespace = parse_scraper_args(self.parser.parse_args(argv), self.parser)
        self.assertEqual(namespace.start, expected_start)
def test_mode_flag_returns_list(self):
    """``mode`` is a list for one mode, two modes, or no ``--mode`` flag."""
    single_mode = 'scrape_user -u test_user --all --mode events'.split()
    two_modes = 'scrape_user -u test_user --all --mode events referrers'.split()
    no_mode = 'scrape_user -u test_user --start 2020-01-01 --end 2020-02-01'.split()

    # Parse all inputs, then post-process all of them, then assert.
    namespaces = []
    for argv in (single_mode, two_modes, no_mode):
        namespaces.append(self.parser.parse_args(argv))
    resolved_all = []
    for namespace in namespaces:
        resolved_all.append(parse_scraper_args(namespace, self.parser))

    for resolved in resolved_all:
        self.assertIsInstance(resolved.mode, list)
def test_sid_and_uid_required_together_if_input(self):
    """Passing only ``--sid`` or only ``--uid`` exits with an explanatory error."""
    only_sid = 'scrape_user --sid foo -u test_user'.split()
    only_uid = 'scrape_user --uid bar -u test_user'.split()

    namespaces = []
    for argv in (only_sid, only_uid):
        namespaces.append(self.parser.parse_args(argv))

    for namespace in namespaces:
        with self.assertRaises(SystemExit):
            with capture_sys_output() as (out, err):
                parse_scraper_args(namespace, self.parser)
        # Read the captured stderr only after the exit has been asserted.
        captured = err.getvalue()
        self.assertIn('both "sid" and "uid" arguments', captured)
def test_all_and_period_flags_mutually_exclusive(self):
    """``--all`` combined with ``--start`` and/or ``--end`` exits with an error."""
    all_with_start = 'scrape_user -u test_user --all --start 2020-01-01'.split()
    all_with_end = 'scrape_user -u test_user --all --end 2020-01-01'.split()
    all_with_both = 'scrape_user -u test_user --all --start 2020-01-01 --end 2020-02-01'.split()

    namespaces = []
    for argv in (all_with_start, all_with_end, all_with_both):
        namespaces.append(self.parser.parse_args(argv))

    for namespace in namespaces:
        with self.assertRaises(SystemExit):
            with capture_sys_output() as (out, err):
                parse_scraper_args(namespace, self.parser)
        # Read the captured stderr only after the exit has been asserted.
        captured = err.getvalue()
        self.assertIn('Can\'t use "--all" flag with', captured)
def test_period_set(self):
    """Commands with no ``--all``, ``--start`` or ``--end`` exit with 'Period must be set'.

    Checked with an explicit ``--creds`` path and with the flag omitted.
    """
    # TODO - this hits error because creds path needs to be mocked
    explicit_creds = 'scrape_user --creds ~/.medium_creds.ini -u test_user'.split()
    implicit_creds = 'scrape_user -u test_user'.split()

    namespaces = []
    for argv in (explicit_creds, implicit_creds):
        namespaces.append(self.parser.parse_args(argv))

    for namespace in namespaces:
        with self.assertRaises(SystemExit):
            with capture_sys_output() as (out, err):
                parse_scraper_args(namespace, self.parser)
        # Read the captured stderr only after the exit has been asserted.
        captured = err.getvalue()
        self.assertIn('Period must be set', captured)
def test_period_flags_obey_correct_time_order(self):
    """An ``--end`` earlier than ``--start`` exits with an error.

    Checked for date-only values and for full datetime values.
    """
    start, end = '2020-01-02', '2020-01-01'
    start_dt, end_dt = '2020-01-02T00:00:00', '2020-01-01T00:00:00'

    dates_argv = f'scrape_user -u test_user --start {start} --end {end}'.split()
    datetimes_argv = f'scrape_user -u test_user --start {start_dt} --end {end_dt}'.split()

    namespaces = []
    for argv in (dates_argv, datetimes_argv):
        namespaces.append(self.parser.parse_args(argv))

    for namespace in namespaces:
        with self.assertRaises(SystemExit):
            with capture_sys_output() as (out, err):
                parse_scraper_args(namespace, self.parser)
        # Read the captured stderr only after the exit has been asserted.
        captured = err.getvalue()
        self.assertIn('"--end" cannot be prior to "--start"', captured)
def main():
    """Command-line entry point: parse arguments and run the selected command.

    ``fetch_cookies`` signs in through ``MediumAuthorizer`` and saves the
    cookies to ``args.creds``. ``scrape_user`` / ``scrape_publication``
    build the matching stat grabber, create the output directories, and
    write stats for every requested mode.
    """
    ## PARSE ARGS
    parser = get_argparser()
    args = parser.parse_args()

    # TODO - make this a plain path; not a directory argument
    command = args.command

    ## EXECUTE COMMANDS
    if command == 'fetch_cookies':
        # Imported only when this command runs.
        from medium_stats.cookie_fetcher import MediumAuthorizer

        email, password = unpack_email_pwd(args)
        authorizer = MediumAuthorizer(args.u, email, password)
        authorizer.sign_in()
        authorizer.save_cookies(args.creds)
        print(section_break)

    elif command in ('scrape_user', 'scrape_publication'):
        args = parse_scraper_args(args, parser)

        # Take sid/uid from the creds config when one is set, otherwise
        # directly from the parsed arguments.
        if args.creds:
            config = MediumConfigHelper(args.creds, args.u)
            sid = config.sid
            uid = config.uid
        else:
            sid = args.sid
            uid = args.uid

        modes = list(args.mode)

        def folders_for(mode_attrs):
            """Return the 'folder' entry of ``mode_attrs`` for each requested mode."""
            return [mode_attrs[mode]['folder'] for mode in modes]

        print('\nGetting Preliminary Data...', end='\n\n')

        # Pick the grabber, its mode table, and the call that produces the
        # first dataset; both commands share everything after this.
        if command == 'scrape_user':
            grabber = StatGrabberUser(
                args.u, sid, uid, args.start, args.end, already_utc=True
            )
            mode_attrs = user_mode_attrs
            first_mode = 'summary'
            # get summary stats to derive article_ids and user creation_time
            fetch_first = grabber.get_summary_stats
        else:
            grabber = StatGrabberPublication(
                args.s, sid, uid, args.start, args.end, already_utc=True
            )
            mode_attrs = pub_mode_attrs
            first_mode = 'story_overview'
            fetch_first = grabber.get_all_story_overview

        folders = folders_for(mode_attrs)
        out_dir = create_directories(args.output_dir, grabber.slug, folders)

        # The first dataset is always fetched, because the article ids come
        # from it; it is written only when its mode was requested.
        first_data = fetch_first()
        article_ids = grabber.get_article_ids(first_data)
        if first_mode in modes:
            write_stats(grabber, first_data, first_mode, grabber.now, out_dir)

        # go through remainder of modes
        for mode in modes:
            if mode in ('summary', 'story_overview'):
                continue
            # 'events' is fetched without article ids; other modes get them.
            extra_args = () if mode == 'events' else (article_ids,)
            stats = get_stats(grabber, mode, grabber.now, *extra_args)
            write_stats(grabber, stats, mode, grabber.now, out_dir)

    print('All done!')