def preprocess_one_day_tweet(tweet_path):
    """Read one day's tweet JSON file and build a preprocessed DataFrame.

    Parameters
    ----------
    tweet_path : str
        Path to a file containing one tweet record (Python-literal dict,
        as accepted by ``ast.literal_eval``) per line.

    Returns
    -------
    None
        The cleaned/normalized/tokenized DataFrame is pickled to
        ``PREPROCESSED_DATA_PATH + TXT_DATA_NAME + "/" + date + ".pickle"``.

    NOTE
    ----
    The date embedded in ``tweet_path[-15:-5]`` is used as the save name;
    adjust the slice if the date sits elsewhere in the path.
    """
    date = tweet_path[-15:-5]

    ### Load the tweets
    with timer(f"reading {date}", LOGGER):
        tweet_list = []
        with open(tweet_path) as f:
            # BUG FIX: the original called readline() once before the loop
            # and again at the top of the loop body, so the very first
            # record in the file was read and silently discarded.
            # Iterating the file directly processes every line exactly once.
            for line in f:
                try:
                    tweet = preprocess_tweet(line)
                    tweet_dict = ast.literal_eval(tweet)
                    # Round-trip through UTF-16 to repair lone surrogates
                    # left over from the original dump.
                    tweet_dict["body"] = tweet_dict["body"]\
                        .encode('utf-16', 'surrogatepass').decode('utf-16')
                    tweet_list.append(tweet_dict)
                except Exception:
                    # Best-effort parsing: skip malformed records.
                    continue
        tweet_df = pd.DataFrame(tweet_list)
        tweet_df = tweet_df[["body", "created_at"]]
        tweet_list = None  # release memory held by the raw records

    ### Preprocess the tweet text
    # Remove noise
    with timer(f"cleaning {date}", LOGGER):
        tweet_df["body"] = tweet_df["body"].map(remove_noise)

    # Normalize (neologdn collapses repeated chars, width variants, etc.)
    with timer(f"normalizing {date}", LOGGER):
        tweet_df["body"] = tweet_df["body"].map(
            lambda x: neologdn.normalize(x, repeat=3))

    # Tokenize (wakati-gaki: space-separated surface forms)
    with timer(f"tokenizing {date}", LOGGER):
        m = MeCab.Tagger(f"-Owakati -d {MeCab_DICT_PATH}")
        tweet_df["body"] = tweet_df["body"].map(m.parse)

    # Save
    with open(PREPROCESSED_DATA_PATH + TXT_DATA_NAME + "/" + date + ".pickle",
              mode="wb") as f:
        pickle.dump(tweet_df, f)
    return None
def check_one_day_word(tweet_path):
    """Build and save the set of unique words appearing in one day's tweets.

    Parameters
    ----------
    tweet_path : str
        Path to a pickled DataFrame of tweets (documents); the DataFrame
        must have a "body" column containing space-tokenized text.

    Returns
    -------
    None
        The word set is pickled to
        ``PREPROCESSED_DATA_PATH + "word_sets/" + date + ".pickle"``.

    NOTE
    ----
    The date embedded in ``tweet_path[-17:-7]`` is used as the save name;
    adjust the slice if the date sits elsewhere in the path.
    """
    date = tweet_path[-17:-7]
    with timer(f"check one day word {date}", LOGGER):
        with open(tweet_path, mode="rb") as f:
            df = pickle.load(f)
        tweets = df["body"].values
        del df  # release memory
        # Lower-case the whole corpus, then split back into tokens.
        # (A stray no-op expression statement `tweets` was removed here.)
        tweets = " ".join(tweets).lower()
        word_set = set(tweets.split(" "))
        with open(PREPROCESSED_DATA_PATH + "word_sets/" + date + ".pickle",
                  mode="wb") as f:
            pickle.dump(word_set, f, protocol=-1)
    return
def make_one_day_ppmi_list(path_tuple):
    '''Compute the PPMI values for one day's co-occurrence counts.

    Based on eq (1) from https://arxiv.org/pdf/1703.00607.pdf

    Params:
    ------
    path_tuple: (tweet_path, co_occ_path)
        tweet_path: str
            Path to the pickled DataFrame of tweets (documents);
            the DataFrame must have a "body" column with the text.
        co_occ_path: str
            Path to a pickled tuple of
            (word co-occurrence adjacency dict, word-frequency array).

    Return:
    ------
    None
        Saves the PPMI array to
        PREPROCESSED_DATA_PATH + "ppmi_list/" + date + ".pickle".

    NOTE
    ----
    - The date embedded in tweet_path[-17:-7] is used as the save name;
      adjust the slice if the date sits elsewhere in the path.
    - The target vocabulary is restricted by WORD_FREQ_MIN
      (via filtered_word2idx).
    '''
    tweet_path, co_occ_path = path_tuple
    date = tweet_path[-17:-7]
    with timer(f"load data {date}", LOGGER):
        with open(co_occ_path, mode="rb") as f:
            co_occ_dict, word_count = pickle.load(f)
        # Load the word -> index mapping
        with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
                  mode="rb") as f:
            filtered_word2idx = pickle.load(f)
    with do_job(f"calc ppmi_list {date}", LOGGER):
        # |D| : total number of tokens in corpus
        D = get_number_of_tokens(tweet_path)
        ppmi_list = []
        for target_word, target_word_idx in filtered_word2idx.items():
            # pop() frees each entry as it is consumed, keeping memory flat.
            # NOTE(review): this raises KeyError if co_occ_dict is missing a
            # word present in filtered_word2idx — presumably the two files
            # are always generated from the same vocabulary; verify.
            cnt = co_occ_dict.pop(target_word)
            for co_occ_word_idx, co_occ_freq in cnt.items():
                # Low-frequency words were already filtered out of the dict.
                ppmi = calc_ppmi(co_occ_freq, word_count, target_word_idx,
                                 co_occ_word_idx, D)
                if ppmi > 0:
                    # Keep only positive values so a sparse matrix can be
                    # built from the triplets later.
                    ppmi_list.append([ppmi, target_word_idx, co_occ_word_idx])
        with open(PREPROCESSED_DATA_PATH + "ppmi_list/" + date + ".pickle",
                  mode="wb") as f:
            pickle.dump(np.array(ppmi_list), f, protocol=-1)
    return
# Collect intel strings from the raw regex captures.
for match in bad_intel:
    for x in match:  # Because "match" is a tuple
        if x != '':  # If the value isn't empty
            intel.add(x)
# External URLs whose TLD is in the intel list are intel too.
for url in external:
    try:
        if top_level(url, fix_protocol=True) in intels:
            intel.add(url)
    except:
        # NOTE(review): bare except silently drops URL-parse failures.
        pass
# Records the time at which crawling stopped
now = time.time()
# Finds total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(diff, processed)
# Step 4. Save the results
if not os.path.exists(output_dir):  # if the directory doesn't exist
    os.mkdir(output_dir)  # create a new directory
# Datasets and their names stay index-aligned for writer().
datasets = [
    files, intel, robots, custom, failed, internal, scripts, external,
    fuzzable, endpoints, keys
]
dataset_names = [
    'files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts',
    'external', 'fuzzable', 'endpoints', 'keys'
]
writer(datasets, dataset_names, output_dir)
# NOTE(review): near-duplicate of the fragment above (comments were in
# Chinese in the original); the final writer() call is truncated in the
# source and is reproduced as-is.
for match in bad_intel:
    for x in match:  # because match is a tuple
        if x != '':  # if the value is not empty
            intel.add(x)
for url in external:
    try:
        if top_level(url, fix_protocol=True) in intels:
            intel.add(url)
    except:
        pass
# Record the time at which crawling stopped
now = time.time()
# Compute the total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(
    diff, processed)  # uses the timer() function in core/utils.py
# Step 4: save the results
if not os.path.exists(output_dir):  # if the directory doesn't exist
    os.mkdir(output_dir)  # create a new directory
datasets = [
    files, intel, robots, custom, failed, internal, scripts, external,
    fuzzable, endpoints, keys
]
dataset_names = [
    'files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts',
    'external', 'fuzzable', 'endpoints', 'keys'
]
writer(datasets, dataset_names,
def make_one_day_co_occ_dict(tweet_path, window_size=21):
    '''Count word co-occurrences and word frequencies for one day.

    Params
    ------
    tweet_path: str
        Path to the pickled DataFrame of tweets (documents);
        the DataFrame must have a "body" column with space-tokenized text.
    window_size: int
        Total width of the co-occurrence search window: the word itself
        plus the words before and after it. Odd values recommended.

    Return
    ------
    None
        Saves a tuple (co-occurrence adjacency dict, word-frequency array)
        to PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/" + date.

    NOTE
    ----
    The date embedded in tweet_path[-17:-7] is used as the save name;
    adjust the slice if the date sits elsewhere in the path.
    '''
    date = tweet_path[-17:-7]
    with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
              mode="rb") as f:
        filtered_word2idx = pickle.load(f)
    filtered_word_num = len(filtered_word2idx)

    with timer(f"load {date} data", LOGGER):
        # Load the day's tweets
        with open(tweet_path, mode="rb") as f:
            df = pickle.load(f)
        tweets = df["body"].values
        del df  # release memory

    # Record word co-occurrences
    with do_job(f"make co_occ_dict {date}", LOGGER):
        co_occ_dict = {w: {} for w in filtered_word2idx.keys()}
        word_count = np.zeros(filtered_word_num)
        # Search radius around each word; loop-invariant, hoisted.
        window_radius = (window_size - 1) // 2
        for tweet in tweets:
            splited_tweet = tweet.lower().split(" ")
            tweet_len = len(splited_tweet)
            for i, w in enumerate(splited_tweet):
                w_idx = filtered_word2idx.get(w)
                # BUG FIX: the original used `if w_idx:` which treats
                # index 0 as missing, so the word mapped to index 0 was
                # never counted.
                if w_idx is None:
                    continue
                word_count[w_idx] += 1
                # Window bounds (clamped to the tweet).
                first = max(i - window_radius, 0)
                # BUG FIX: the original iterated range(first, i + radius),
                # excluding the right-most word of the window (asymmetric
                # window, contradicting the docstring); +1 makes the
                # window symmetric. Self co-occurrence (word_idx == i) is
                # kept, matching the original behavior.
                last = min(i + window_radius + 1, tweet_len)
                # Record co-occurring word ids
                for word_idx in range(first, last):
                    co_occ_word = splited_tweet[word_idx]
                    co_occ_idx = filtered_word2idx.get(co_occ_word)
                    if co_occ_idx is None:  # same index-0 fix as above
                        continue
                    co_occ_dict[w][co_occ_idx] = \
                        co_occ_dict[w].get(co_occ_idx, 0) + 1
            del splited_tweet  # release memory

    # Save
    save_path = (PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/"
                 + date + ".pickle")
    with open(save_path, mode="wb") as f:
        pickle.dump((co_occ_dict, word_count), f, protocol=-1)
    return
def make_whole_day_co_occ_dict(TWEETS_PATHS, window_size=21):
    '''Build the filtered vocabulary, then save per-day co-occurrence data.

    Params
    ------
    TWEETS_PATHS: [str, str, ...]
        Paths to the pickled tweet DataFrames (must have a "body" column),
        already sorted in time order.
    window_size: int
        Total width of the co-occurrence window: the word itself plus the
        words before and after it. Odd values recommended.

    Return
    ------
    None
        Saves filtered_word2idx, then one (co-occurrence dict,
        word-frequency array) tuple per day.

    NOTE
    ----
    - The date embedded in tweet_path[-17:-7] is used as each save name.
    - The target vocabulary is restricted by WORD_FREQ_MIN.
    '''
    from functools import partial  # local import: only needed here

    ### Restrict the target vocabulary
    with timer(f"filter words", LOGGER):
        # Load the full word -> index mapping
        with open(PREPROCESSED_DATA_PATH + "unique_word2idx.pickle",
                  mode="rb") as f:
            word2idx = pickle.load(f)
        unique_word_num = len(word2idx.keys())

        # Count word frequencies.
        # NOTE(review): only the last 7 files are counted here
        # (TWEETS_PATHS[-7:]) even though all days are processed below —
        # confirm this restriction is intentional.
        word_count = np.zeros(unique_word_num)
        for tweet_path in TWEETS_PATHS[-7:]:
            with open(tweet_path, mode="rb") as f:
                df = pickle.load(f)
            tweets = df["body"].values
            del df
            for tweet in tweets:
                for w in tweet.lower().split(" "):
                    try:
                        word_count[word2idx[w]] += 1
                    except KeyError:
                        # Word not in the vocabulary; ignore.
                        continue

        # Map old indices -> new compact indices for frequent-enough words.
        idxconverter = {}
        new_idx = 0
        for old_idx in word2idx.values():
            if word_count[old_idx] >= WORD_FREQ_MIN:
                idxconverter[old_idx] = new_idx
                new_idx += 1

        # Build the reduced word2idx.
        filtered_word2idx = {}
        for word, old_idx in word2idx.items():
            try:
                filtered_word2idx[word] = idxconverter[old_idx]
            except KeyError:
                continue

        # Save
        with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
                  mode="wb") as f:
            pickle.dump(filtered_word2idx, f, protocol=-1)

        # Release memory
        del word2idx, word_count, idxconverter

    if not os.path.exists(PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/"):
        os.mkdir(PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/")

    with Pool(processes=N_JOB) as p:
        # BUG FIX: the original passed make_one_day_co_occ_dict directly,
        # so this function's window_size argument was silently ignored and
        # the workers always used the default. Forward it via partial().
        p.map(partial(make_one_day_co_occ_dict, window_size=window_size),
              TWEETS_PATHS)
    return
def post(self, target, level_):
    """Run a full Photon crawl against *target* and return the results.

    Parameters
    ----------
    target : str
        Host to crawl; "http://" is prepended below.
    level_ : int-like
        Crawl depth; coerced with int().

    Returns
    -------
    (dict, int)
        The collected datasets and an HTTP-style 200 status.

    NOTE(review): despite taking `target`/`level_`, this method still
    parses sys.argv via argparse and then overwrites several parsed
    values (root, level, threads, api, archive, export) — the CLI flags
    mostly act as defaults here.
    """
    # Crawl state lives in module-level globals shared with the
    # extractor/jscanner worker threads.
    global keys
    global files
    global intel
    global robots
    global custom
    global failed
    global scripts
    global external
    global fuzzable
    global endpoints
    global processed
    global internal
    global main_url
    global delay
    global cook
    global headers
    global timeout
    global host
    global proxies
    global user_agents
    global only_urls
    global bad_intel
    global bad_scripts
    global clone
    global schema
    global args
    global supress_regex
    global results
    results = {}
    # Disable SSL related warnings
    warnings.filterwarnings('ignore')
    # Processing command line arguments
    parser = argparse.ArgumentParser()
    # Options
    parser.add_argument('-u', '--url', help='root url', dest='root')
    parser.add_argument('-c', '--cookie', help='cookie', dest='cook')
    parser.add_argument('-r', '--regex', help='regex pattern', dest='regex')
    parser.add_argument('-e', '--export', help='export format',
                        dest='export', choices=['csv', 'json'])
    parser.add_argument('-o', '--output', help='output directory',
                        dest='output')
    parser.add_argument('-l', '--level', help='levels to crawl',
                        dest='level', type=int)
    parser.add_argument('-t', '--threads', help='number of threads',
                        dest='threads', type=int)
    parser.add_argument('-d', '--delay', help='delay between requests',
                        dest='delay', type=float)
    parser.add_argument('-v', '--verbose', help='verbose output',
                        dest='verbose', action='store_true')
    parser.add_argument('-s', '--seeds', help='additional seed URLs',
                        dest='seeds', nargs="+", default=[])
    parser.add_argument('--stdout', help='send variables to stdout',
                        dest='std')
    parser.add_argument('--user-agent', help='custom user agent(s)',
                        dest='user_agent')
    parser.add_argument('--exclude', help='exclude URLs matching this regex',
                        dest='exclude')
    parser.add_argument('--timeout', help='http request timeout',
                        dest='timeout', type=float)
    parser.add_argument('-p', '--proxy',
                        help='Proxy server IP:PORT or DOMAIN:PORT',
                        dest='proxies', type=proxy_type)
    # Switches
    parser.add_argument('--clone', help='clone the website locally',
                        dest='clone', action='store_true')
    parser.add_argument('--headers', help='add headers',
                        dest='headers', action='store_true')
    parser.add_argument('--dns', help='enumerate subdomains and DNS data',
                        dest='dns', action='store_true')
    parser.add_argument('--keys', help='find secret keys',
                        dest='api', action='store_true')
    parser.add_argument('--update', help='update photon',
                        dest='update', action='store_true')
    parser.add_argument('--only-urls', help='only extract URLs',
                        dest='only_urls', action='store_true')
    parser.add_argument('--wayback',
                        help='fetch URLs from archive.org as seeds',
                        dest='archive', action='store_true')
    args = parser.parse_args()
    # NOTE(review): debug prints left in; consider removing/logging.
    print("------------------------------------------------")
    print(args.root)
    print(type(args.level))
    print(type(args.threads))
    print(args.api)
    print(args.archive)
    print(args.export)
    # Override the parsed CLI values with the request parameters.
    args.root = "http://" + target
    args.level = int(level_)
    args.threads = 30
    args.api = True
    args.archive = True
    args.export = "json"
    # If the user has supplied --update argument
    if args.update:
        updater()
        quit()
    # If the user has supplied a URL
    if args.root:
        main_inp = args.root
        if main_inp.endswith('/'):
            # We will remove it as it can cause problems later in the code
            main_inp = main_inp[:-1]
    # If the user hasn't supplied an URL
    else:
        print('\n' + parser.format_help().lower())
        quit()
    clone = args.clone
    headers = args.headers  # prompt for headers
    verbose = args.verbose  # verbose output
    delay = args.delay or 0  # Delay between requests
    timeout = args.timeout or 6  # HTTP request timeout
    cook = args.cook or None  # Cookie
    api = bool(
        args.api)  # Extract high entropy strings i.e. API keys and stuff
    proxies = []
    if args.proxies:
        print("%s Testing proxies, can take a while..." % info)
        for proxy in args.proxies:
            if is_good_proxy(proxy):
                proxies.append(proxy)
            else:
                print("%s Proxy %s doesn't seem to work or timedout" %
                      (bad, proxy['http']))
        print("%s Done" % info)
        if not proxies:
            print("%s no working proxies, quitting!" % bad)
            exit()
    else:
        proxies.append(None)
    crawl_level = args.level or 2  # Crawling level
    thread_count = args.threads or 2  # Number of threads
    only_urls = bool(args.only_urls)  # Only URLs mode is off by default
    # Variables we are gonna use later to store stuff
    keys = set()  # High entropy strings, prolly secret keys
    files = set()  # The pdf, css, png, etc files.
    intel = set(
    )  # The email addresses, website accounts, AWS buckets etc.
    robots = set()  # The entries of robots.txt
    custom = set()  # Strings extracted by custom regex pattern
    failed = set()  # URLs that photon failed to crawl
    scripts = set()  # THe Javascript files
    external = set(
    )  # URLs that don't belong to the target i.e. out-of-scope
    # URLs that have get params in them e.g. example.com/page.php?id=2
    fuzzable = set()
    endpoints = set()  # URLs found from javascript files
    processed = set(['dummy'])  # URLs that have been crawled
    # URLs that belong to the target i.e. in-scope
    internal = set(args.seeds)
    everything = []
    bad_scripts = set()  # Unclean javascript file urls
    bad_intel = set()  # needed for intel filtering
    core.config.verbose = verbose
    if headers:
        try:
            # NOTE(review): `prompt = prompt()` rebinds the name of the
            # prompt() helper; a second call in this process would fail.
            prompt = prompt()
        except FileNotFoundError as e:
            print('Could not load headers prompt: {}'.format(e))
            quit()
        headers = extract_headers(prompt)
    # If the user hasn't supplied the root URL with http(s), we will handle it
    if main_inp.startswith('http'):
        main_url = main_inp
    else:
        try:
            requests.get('https://' + main_inp,
                         proxies=random.choice(proxies))
            main_url = 'https://' + main_inp
        except:
            main_url = 'http://' + main_inp
    schema = main_url.split('//')[0]  # https: or http:?
    # Adding the root URL to internal for crawling
    internal.add(main_url)
    # Extracts host out of the URL
    host = urlparse(main_url).netloc
    output_dir = args.output or host
    # NOTE(review): the line below overrides the line above, so args.output
    # and the host-derived directory are always ignored — confirm intended.
    output_dir = "results"
    try:
        domain = top_level(main_url)
    except:
        domain = host
    if args.user_agent:
        user_agents = args.user_agent.split(',')
    else:
        with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas:
            user_agents = [agent.strip('\n') for agent in uas]
    supress_regex = False
    # Records the time at which crawling started
    then = time.time()
    # Step 1. Extract urls from robots.txt & sitemap.xml
    zap(main_url, args.archive, domain, host, internal, robots, proxies)
    # This is so the level 1 emails are parsed as well
    internal = set(remove_regex(internal, args.exclude))
    # Step 2. Crawl recursively to the limit specified in "crawl_level"
    for level in range(crawl_level):
        # Links to crawl = (all links - already crawled links) - links not to crawl
        links = remove_regex(internal - processed, args.exclude)
        # If links to crawl are 0 i.e. all links have been crawled
        if not links:
            break
        # if crawled links are somehow more than all links. Possible? ;/
        elif len(internal) <= len(processed):
            if len(internal) > 2 + len(args.seeds):
                break
        print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
        try:
            flash(self.extractor, links, thread_count)
        except KeyboardInterrupt:
            print('')
            break
    if not only_urls:
        # Normalize relative/protocol-less script URLs against main_url.
        for match in bad_scripts:
            if match.startswith(main_url):
                scripts.add(match)
            elif match.startswith('/') and not match.startswith('//'):
                scripts.add(main_url + match)
            elif not match.startswith('http') and not match.startswith(
                    '//'):
                scripts.add(main_url + '/' + match)
        # Step 3. Scan the JavaScript files for endpoints
        print('%s Crawling %i JavaScript files' % (run, len(scripts)))
        flash(self.jscanner, scripts, thread_count)
    for url in internal:
        if '=' in url:
            fuzzable.add(url)
    for match, intel_name, url in bad_intel:
        if isinstance(match, tuple):
            for x in match:  # Because "match" is a tuple
                if x != '':  # If the value isn't empty
                    if intel_name == "CREDIT_CARD":
                        # NOTE(review): luhn() is given the whole tuple
                        # `match`, not the element `x` — verify.
                        if not luhn(match):  # garbage number
                            continue
                    intel.add("%s:%s" % (intel_name, x))
        else:
            if intel_name == "CREDIT_CARD":
                if not luhn(match):  # garbage number
                    continue
            intel.add("%s:%s:%s" % (url, intel_name, match))
    for url in external:
        try:
            if top_level(url, fix_protocol=True) in INTELS:
                intel.add(url)
        except:
            pass
    # Records the time at which crawling stopped
    now = time.time()
    # Finds total time taken
    diff = (now - then)
    minutes, seconds, time_per_request = timer(diff, processed)
    # Step 4. Save the results
    if not os.path.exists(output_dir):  # if the directory doesn't exist
        os.mkdir(output_dir)  # create a new directory
    datasets = [
        files, intel, robots, custom, failed, internal, scripts, external,
        fuzzable, endpoints, keys
    ]
    dataset_names = [
        'files', 'intel', 'robots', 'custom', 'failed', 'internal',
        'scripts', 'external', 'fuzzable', 'endpoints', 'keys'
    ]
    writer(datasets, dataset_names, output_dir)
    # Printing out results
    print(('%s-%s' % (red, end)) * 50)
    for dataset, dataset_name in zip(datasets, dataset_names):
        if dataset:
            print('%s %s: %s' % (good, dataset_name.capitalize(),
                                 len(dataset)))
    print(('%s-%s' % (red, end)) * 50)
    print('%s Total requests made: %i' % (info, len(processed)))
    print('%s Total time taken: %i minutes %i seconds' % (info, minutes,
                                                          seconds))
    print('%s Requests per second: %i' % (info, int(len(processed) / diff)))
    # Rebuild datasets as a JSON-serializable name -> list mapping.
    datasets = {
        'files': list(files),
        'intel': list(intel),
        'robots': list(robots),
        'custom': list(custom),
        'failed': list(failed),
        'internal': list(internal),
        'scripts': list(scripts),
        'external': list(external),
        'fuzzable': list(fuzzable),
        'endpoints': list(endpoints),
        'keys': list(keys)
    }
    if args.dns:
        print('%s Enumerating subdomains' % run)
        from plugins.find_subdomains import find_subdomains
        subdomains = find_subdomains(domain)
        print('%s %i subdomains found' % (info, len(subdomains)))
        writer([subdomains], ['subdomains'], output_dir)
        datasets['subdomains'] = subdomains
        from plugins.dnsdumpster import dnsdumpster
        print('%s Generating DNS map' % run)
        dnsdumpster(domain, output_dir)
    if args.export:
        from plugins.exporter import exporter
        # exporter(directory, format, datasets)
        results = datasets
        exporter(output_dir, args.export, datasets)
    print('%s Results saved in %s%s%s directory' % (good, green, output_dir,
                                                    end))
    if args.std:
        for string in datasets[args.std]:
            sys.stdout.write(string + '\n')
    return results, 200
# NOTE(review): fragment duplicated from the intel loop above; it begins
# mid-structure (the enclosing for/else is not in view — the dangling
# `continue` belongs to it) and is truncated at the end. Reproduced as-is.
if not luhn(match):  # garbage number
    continue
intel.add("%s:%s:%s" % (url, intel_name, match))
for url in external:
    try:
        if top_level(url, fix_protocol=True) in INTELS:
            intel.add(url)
    except:
        pass
# Records the time at which crawling stopped
now = time.time()
# Finds total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(diff, processed)
# Step 4. Save the results
if not os.path.exists(output_dir):  # if the directory doesn't exist
    os.mkdir(output_dir)  # create a new directory
datasets = [files, intel, robots, custom, failed, internal, scripts,
            external, fuzzable, endpoints, keys]
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
                 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
writer(datasets, dataset_names, output_dir)
# Printing out results
print(('%s-%s' % (red, end)) * 50)
for dataset, dataset_name in zip(datasets, dataset_names):
    if dataset:
# Seed the in-scope URL set from the user-supplied seeds.
result['urls']['internal'] = set(args.seeds)
load_modules('before-crawling')
# Derive the root URL (scheme + host) from the input URL.
parsed_url = urlparse(var['input_url'])
root_url = parsed_url.scheme + '://' + parsed_url.netloc
var['root_url'] = root_url
result['urls']['internal'].add(var['root_url'])
zap()  # parse sitemap.xml and robots.txt
try:
    crawler()
except KeyboardInterrupt:
    # Allow the user to stop crawling early; results so far are kept.
    print('%s Crawler stopped' % info)
now = time.time()  # record ending time
diff = (now - then)  # total time taken
minutes, seconds, time_per_request = timer(diff, var['processed'])
# Convert the collected sets to lists so the result is serializable.
result['data']['files'] = list(result['data']['files'])
result['urls']['failed'] = list(result['urls']['failed'])
result['data']['websites'] = list(result['data']['websites'])
result['urls']['internal'] = list(result['urls']['internal'])
result['urls']['external'] = list(result['urls']['external'])
result['urls']['fuzzable'] = list(result['urls']['fuzzable'])
# Keep only tokens that validate; preserve their source URLs.
if 'token' in result['data']['extractors']:
    valid_tokens = {}
    for token, url in result['data']['extractors']['token'].items():
        is_valid = is_token(token)
        if is_valid:
            valid_tokens[token] = url
    result['data']['extractors']['token'] = valid_tokens