Example #1
def preprocess_one_day_tweet(tweet_path):
    '''Read the JSON file given by tweet_path and build a DataFrame of tweets.
    tweet_path:  str
    NOTE
    ----
    The date found in tweet_path[-15:-5] is used as the file name when saving.
    Adjust it if the date sits at a different position in the path.
    '''
    date = tweet_path[-15:-5]
    ### Read the tweets
    with timer(f"reading {date}", LOGGER):
        tweet_list = []
        with open(tweet_path) as f:
            for line in f:
                try:
                    tweet = preprocess_tweet(line)
                    tweet_dict = ast.literal_eval(tweet)
                    # Repair surrogate pairs (e.g. emoji) left in the dumped text
                    tweet_dict["body"] = tweet_dict["body"].encode(
                        'utf-16', 'surrogatepass').decode('utf-16')
                    tweet_list.append(tweet_dict)
                except Exception:
                    # skip lines that cannot be parsed
                    continue

        tweet_df = pd.DataFrame(tweet_list)
        tweet_df = tweet_df[["body", "created_at"]]
        tweet_list = None  # free the now-unneeded list


    ### Preprocess the tweet text
    # Remove noise
    with timer(f"cleaning {date}", LOGGER):
        # tweet_df["source_url"] = tweet_df["source_url"]\
        #                             .map(lambda x: x.encode('utf-16','surrogatepass').decode('utf-16'))
        tweet_df["body"] = tweet_df["body"].map(lambda x: remove_noise(x))

    # Normalize the text
    with timer(f"normalizing {date}", LOGGER):
        tweet_df["body"] = tweet_df["body"].map(lambda x: neologdn.normalize(x, repeat=3))

    # Tokenize into space-separated words (wakati-gaki)
    with timer(f"tokenizing {date}", LOGGER):
        m = MeCab.Tagger(f"-Owakati -d {MeCab_DICT_PATH}")
        tweet_df["body"] = tweet_df["body"].map(lambda x: m.parse(x))

    # Save
    with open(PREPROCESSED_DATA_PATH+TXT_DATA_NAME+"/"+date+".pickle", mode="wb") as f:
        pickle.dump(tweet_df, f)

    return None
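A minimal driver sketch for running this preprocessing over many daily dumps, mirroring the Pool pattern of Example #7. RAW_DATA_PATH and the placeholder N_JOB value are hypothetical; the project configures its own paths, logger, and worker count elsewhere.

import glob
from multiprocessing import Pool

RAW_DATA_PATH = "./data/raw_tweets/"  # hypothetical location of the daily JSON dumps
N_JOB = 4                             # placeholder; the project defines N_JOB elsewhere

# One raw JSON file per day, named so that path[-15:-5] is the date
raw_tweet_paths = sorted(glob.glob(RAW_DATA_PATH + "*.json"))

if __name__ == "__main__":
    with Pool(processes=N_JOB) as p:
        p.map(preprocess_one_day_tweet, raw_tweet_paths)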
Example #2
def check_one_day_word(tweet_path):
    '''Save the set of words contained in one day's documents.
    tweet_path:  str
        Path to the pickled DataFrame of tweets (documents).
        The DataFrame must have a "body" column containing the documents.
    NOTE
    ----
    The date found in tweet_path[-17:-7] is used as the file name when saving.
    Adjust it if the date sits at a different position in the path.
    '''
    date = tweet_path[-17:-7]
    with timer(f"check one day word {date}", LOGGER):
        with open(tweet_path, mode="rb") as f:
            df = pickle.load(f)

        tweets = df["body"].values
        del df  # free the now-unneeded DataFrame
        tweets = " ".join(tweets).lower()
        tweets = tweets.split(" ")

        word_set = set(tweets)

        with open(PREPROCESSED_DATA_PATH + "word_sets/" + date + ".pickle",
                  mode="wb") as f:
            pickle.dump(word_set, f, protocol=-1)
    return
Example #3
def make_one_day_ppmi_list(path_tuple):
    '''Compute the PPMI for each time step,
    based on eq. (1) from
    https://arxiv.org/pdf/1703.00607.pdf
    Params:
    ------
    path_tuple: (tweet_path, co_occ_path)
        tweet_path:  str
        Path to the pickled DataFrame of tweets (documents).
        The DataFrame must have a "body" column containing the documents.
        co_occ_path:  str
        Path to the pickled tuple of (word co-occurrence adjacency list,
        word frequency array).
    Return:
    ------
    None
    Saves the PPMI array.
    NOTE
    ----
    - The date found in tweet_path[-17:-7] is used as the file name when saving.
      Adjust it if the date sits at a different position in the path.
    - The target vocabulary is restricted by WORD_FREQ_MIN.
    '''
    tweet_path, co_occ_path = path_tuple
    date = tweet_path[-17:-7]

    with timer(f"load data {date}", LOGGER):
        with open(co_occ_path, mode="rb") as f:
            co_occ_dict, word_count = pickle.load(f)

        # Load the word -> idx mapping
        with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
                  mode="rb") as f:
            filtered_word2idx = pickle.load(f)

    with do_job(f"calc ppmi_list {date}", LOGGER):
        # |D| : total number of tokens in corpus
        D = get_number_of_tokens(tweet_path)
        ppmi_list = []
        for target_word, target_word_idx in filtered_word2idx.items():
            cnt = co_occ_dict.pop(target_word)
            for co_occ_word_idx, co_occ_freq in cnt.items():
                # low-frequency words are already excluded via filtered_word2idx (WORD_FREQ_MIN)
                ppmi = calc_ppmi(co_occ_freq, word_count, target_word_idx,
                                 co_occ_word_idx, D)
                if ppmi > 0:
                    # keep only positive values, since a sparse matrix is built from them
                    ppmi_list.append([ppmi, target_word_idx, co_occ_word_idx])

        with open(PREPROCESSED_DATA_PATH + "ppmi_list/" + date + ".pickle",
                  mode="wb") as f:
            pickle.dump(np.array(ppmi_list), f, protocol=-1)

    return
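The helper calc_ppmi is not shown in these examples. A minimal sketch of what it might compute, assuming the standard PPMI definition in the spirit of eq. (1) of the cited paper, with word_count holding per-word token counts and D the total number of tokens; the actual implementation may differ.

import numpy as np

def calc_ppmi(co_occ_freq, word_count, target_word_idx, co_occ_word_idx, D):
    '''Sketch: PPMI(w, c) = max(log((#(w, c) * D) / (#(w) * #(c))), 0).'''
    pmi = np.log((co_occ_freq * D) /
                 (word_count[target_word_idx] * word_count[co_occ_word_idx]))
    return max(pmi, 0.0)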
Example #4
File: photon.py  Project: xingsu56/Photon
    for match in bad_intel:
        for x in match:  # Because "match" is a tuple
            if x != '':  # If the value isn't empty
                intel.add(x)
        for url in external:
            try:
                if top_level(url, fix_protocol=True) in intels:
                    intel.add(url)
            except:
                pass

# Records the time at which crawling stopped
now = time.time()
# Finds total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(diff, processed)

# Step 4. Save the results
if not os.path.exists(output_dir):  # if the directory doesn't exist
    os.mkdir(output_dir)  # create a new directory

datasets = [
    files, intel, robots, custom, failed, internal, scripts, external,
    fuzzable, endpoints, keys
]
dataset_names = [
    'files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts',
    'external', 'fuzzable', 'endpoints', 'keys'
]

writer(datasets, dataset_names, output_dir)
Example #5
    for match in bad_intel:
        for x in match:  # Because "match" is a tuple
            if x != '':  # If the value isn't empty
                intel.add(x)
        for url in external:
            try:
                if top_level(url, fix_protocol=True) in intels:
                    intel.add(url)
            except:
                pass

# Record the time at which crawling stopped
now = time.time()
# Compute the total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(
    diff, processed)  # uses the timer() function from core/utils.py

# Step 4: save the results
if not os.path.exists(output_dir):  # if the directory doesn't exist
    os.mkdir(output_dir)  # create a new directory

datasets = [
    files, intel, robots, custom, failed, internal, scripts, external,
    fuzzable, endpoints, keys
]
dataset_names = [
    'files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts',
    'external', 'fuzzable', 'endpoints', 'keys'
]

writer(datasets, dataset_names, output_dir)
Example #6
def make_one_day_co_occ_dict(tweet_path, window_size=21):
    '''Save the word co-occurrences and word counts for one day.
    Params
    ------
    tweet_path: str
        Path to the pickled DataFrame of tweets (documents).
        The DataFrame must have a "body" column containing the documents.
    window_size: int
        Search width for counting co-occurrences: the total number of words,
        i.e. the word itself plus its neighbours on both sides.
        An odd number is recommended.
    Return
    ------
    None
    Saves a tuple of (word co-occurrence adjacency list, word count array).
    NOTE
    ----
    The date found in tweet_path[-17:-7] is used as the file name when saving.
    Adjust it if the date sits at a different position in the path.
    '''
    date = tweet_path[-17:-7]

    with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
              mode="rb") as f:
        filtered_word2idx = pickle.load(f)
    filtered_word_num = len(filtered_word2idx)

    with timer(f"load {date} data", LOGGER):
        # Load the tweet data
        with open(tweet_path, mode="rb") as f:
            df = pickle.load(f)
        tweets = df["body"].values
        del df  # free the now-unneeded DataFrame

    # Record word co-occurrences
    with do_job(f"make co_occ_dict {date}", LOGGER):
        co_occ_dict = {w: {} for w in filtered_word2idx.keys()}
        word_count = np.zeros(filtered_word_num)
        for tweet in tweets:
            splited_tweet = tweet.lower().split(" ")
            tweet_len = len(splited_tweet)
            for i, w in enumerate(splited_tweet):
                w_idx = filtered_word2idx.get(w)
                if w_idx is None:
                    # skip words outside the filtered vocabulary
                    continue
                word_count[w_idx] += 1

                # Determine the search window around this word
                window_radius = (window_size - 1) // 2
                first = max(i - window_radius, 0)
                last = min(i + window_radius + 1, tweet_len)

                # Record the ids of the co-occurring words
                for word_idx in range(first, last):
                    co_occ_word = splited_tweet[word_idx]
                    co_occ_idx = filtered_word2idx.get(co_occ_word)
                    if co_occ_idx is not None:
                        co_occ_dict[w][co_occ_idx] = \
                            co_occ_dict[w].get(co_occ_idx, 0) + 1

        # free the now-unneeded variable
        del splited_tweet

        # Save
        save_path = PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/" + date + ".pickle"
        with open(save_path, mode="wb") as f:
            pickle.dump((co_occ_dict, word_count), f, protocol=-1)

    return
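A tiny self-contained illustration of the windowed counting above, using a hypothetical three-word vocabulary and window_size=3: the middle word is paired with itself and one neighbour on each side.

word2idx = {"market": 0, "crash": 1, "news": 2}  # hypothetical toy vocabulary
tokens = "market crash news".lower().split(" ")

window_size = 3
window_radius = (window_size - 1) // 2
co_occ = {w: {} for w in word2idx}

for i, w in enumerate(tokens):
    if w not in word2idx:
        continue
    first = max(i - window_radius, 0)
    last = min(i + window_radius + 1, len(tokens))
    for j in range(first, last):
        idx = word2idx.get(tokens[j])
        if idx is not None:
            co_occ[w][idx] = co_occ[w].get(idx, 0) + 1

print(co_occ["crash"])  # {0: 1, 1: 1, 2: 1}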
Example #7
def make_whole_day_co_occ_dict(TWEETS_PATHS, window_size=21):
    '''Save the word co-occurrences and word counts over all days.
    Params
    ------
    TWEETS_PATHS: [str, str, ...]
        List of paths to the pickled DataFrames of tweets (documents).
        Each DataFrame must have a "body" column containing the documents.
        The list must already be sorted chronologically.
    window_size: int
        Search width for counting co-occurrences: the total number of words,
        i.e. the word itself plus its neighbours on both sides.
        An odd number is recommended.
    Return
    ------
    None
    Saves, day by day, a tuple of (word co-occurrence adjacency list,
    word count array).
    NOTE
    ----
    - The date found in tweet_path[-17:-7] is used as the file name when saving.
      Adjust it if the date sits at a different position in the path.
    - The target vocabulary is restricted by WORD_FREQ_MIN.
    '''
    ### Restrict the target vocabulary
    with timer("filter words", LOGGER):
        # Load the data
        with open(PREPROCESSED_DATA_PATH + "unique_word2idx.pickle",
                  mode="rb") as f:
            word2idx = pickle.load(f)
        unique_word_num = len(word2idx.keys())

        # Count word occurrences
        word_count = np.zeros(unique_word_num)
        for tweet_path in TWEETS_PATHS[-7:]:
            with open(tweet_path, mode="rb") as f:
                df = pickle.load(f)
            tweets = df["body"].values
            del df

            for tweet in tweets:
                splited_tweet = tweet.lower().split(" ")
                for w in splited_tweet:
                    try:
                        word_count[word2idx[w]] += 1
                    except KeyError:
                        continue

        # Restrict the word -> idx mapping to sufficiently frequent words
        idxconverter = {}
        new_idx = 0
        for old_idx in word2idx.values():
            if word_count[old_idx] >= WORD_FREQ_MIN:
                idxconverter[old_idx] = new_idx
                new_idx += 1

        # Build the reduced version of word2idx
        filtered_word2idx = {}
        for word, old_idx in word2idx.items():
            try:
                filtered_word2idx[word] = idxconverter[old_idx]
            except KeyError:
                continue

        # Save
        with open(PREPROCESSED_DATA_PATH + "filtered_word2idx.pickle",
                  mode="wb") as f:
            pickle.dump(filtered_word2idx, f, protocol=-1)

        # free the now-unneeded variables
        del word2idx, word_count, idxconverter

    if not os.path.exists(PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/"):
        os.mkdir(PREPROCESSED_DATA_PATH + "co_occ_dict_word_count/")

    with Pool(processes=N_JOB) as p:
        p.map(make_one_day_co_occ_dict, TWEETS_PATHS)

    return
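The unique_word2idx.pickle mapping loaded at the top of this function is not built in any of these examples. A plausible sketch, assuming it is derived from the per-day word sets saved by check_one_day_word in Example #2; the construction below is an assumption, not the project's actual code.

import glob
import pickle

# Hypothetical reconstruction: union the per-day word sets and assign indices
word_set_paths = sorted(glob.glob(PREPROCESSED_DATA_PATH + "word_sets/*.pickle"))
all_words = set()
for path in word_set_paths:
    with open(path, mode="rb") as f:
        all_words |= pickle.load(f)

unique_word2idx = {word: idx for idx, word in enumerate(sorted(all_words))}
with open(PREPROCESSED_DATA_PATH + "unique_word2idx.pickle", mode="wb") as f:
    pickle.dump(unique_word2idx, f, protocol=-1)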
Example #8
    def post(self, target, level_):

        global keys
        global files
        global intel
        global robots
        global custom
        global failed
        global scripts
        global external
        global fuzzable
        global endpoints
        global processed
        global internal
        global main_url
        global delay
        global cook
        global headers
        global timeout
        global host
        global proxies
        global user_agents
        global only_urls
        global bad_intel
        global bad_scripts
        global clone
        global schema
        global args
        global supress_regex
        global results

        results = {}
        # Disable SSL related warnings
        warnings.filterwarnings('ignore')

        # Processing command line arguments
        parser = argparse.ArgumentParser()
        # Options
        parser.add_argument('-u', '--url', help='root url', dest='root')
        parser.add_argument('-c', '--cookie', help='cookie', dest='cook')
        parser.add_argument('-r',
                            '--regex',
                            help='regex pattern',
                            dest='regex')
        parser.add_argument('-e',
                            '--export',
                            help='export format',
                            dest='export',
                            choices=['csv', 'json'])
        parser.add_argument('-o',
                            '--output',
                            help='output directory',
                            dest='output')
        parser.add_argument('-l',
                            '--level',
                            help='levels to crawl',
                            dest='level',
                            type=int)
        parser.add_argument('-t',
                            '--threads',
                            help='number of threads',
                            dest='threads',
                            type=int)
        parser.add_argument('-d',
                            '--delay',
                            help='delay between requests',
                            dest='delay',
                            type=float)
        parser.add_argument('-v',
                            '--verbose',
                            help='verbose output',
                            dest='verbose',
                            action='store_true')
        parser.add_argument('-s',
                            '--seeds',
                            help='additional seed URLs',
                            dest='seeds',
                            nargs="+",
                            default=[])
        parser.add_argument('--stdout',
                            help='send variables to stdout',
                            dest='std')
        parser.add_argument('--user-agent',
                            help='custom user agent(s)',
                            dest='user_agent')
        parser.add_argument('--exclude',
                            help='exclude URLs matching this regex',
                            dest='exclude')
        parser.add_argument('--timeout',
                            help='http request timeout',
                            dest='timeout',
                            type=float)
        parser.add_argument('-p',
                            '--proxy',
                            help='Proxy server IP:PORT or DOMAIN:PORT',
                            dest='proxies',
                            type=proxy_type)

        # Switches
        parser.add_argument('--clone',
                            help='clone the website locally',
                            dest='clone',
                            action='store_true')
        parser.add_argument('--headers',
                            help='add headers',
                            dest='headers',
                            action='store_true')
        parser.add_argument('--dns',
                            help='enumerate subdomains and DNS data',
                            dest='dns',
                            action='store_true')
        parser.add_argument('--keys',
                            help='find secret keys',
                            dest='api',
                            action='store_true')
        parser.add_argument('--update',
                            help='update photon',
                            dest='update',
                            action='store_true')
        parser.add_argument('--only-urls',
                            help='only extract URLs',
                            dest='only_urls',
                            action='store_true')
        parser.add_argument('--wayback',
                            help='fetch URLs from archive.org as seeds',
                            dest='archive',
                            action='store_true')
        args = parser.parse_args()

        print("------------------------------------------------")
        print(args.root)
        print(type(args.level))
        print(type(args.threads))
        print(args.api)
        print(args.archive)
        print(args.export)
        args.root = "http://" + target
        args.level = int(level_)
        args.threads = 30
        args.api = True
        args.archive = True
        args.export = "json"

        # If the user has supplied --update argument
        if args.update:
            updater()
            quit()

        # If the user has supplied a URL
        if args.root:
            main_inp = args.root
            if main_inp.endswith('/'):
                # We will remove it as it can cause problems later in the code
                main_inp = main_inp[:-1]
        # If the user hasn't supplied an URL
        else:
            print('\n' + parser.format_help().lower())
            quit()

        clone = args.clone
        headers = args.headers  # prompt for headers
        verbose = args.verbose  # verbose output
        delay = args.delay or 0  # Delay between requests
        timeout = args.timeout or 6  # HTTP request timeout
        cook = args.cook or None  # Cookie
        api = bool(
            args.api)  # Extract high entropy strings i.e. API keys and stuff

        proxies = []
        if args.proxies:
            print("%s Testing proxies, can take a while..." % info)
            for proxy in args.proxies:
                if is_good_proxy(proxy):
                    proxies.append(proxy)
                else:
                    print("%s Proxy %s doesn't seem to work or timedout" %
                          (bad, proxy['http']))
            print("%s Done" % info)
            if not proxies:
                print("%s no working proxies, quitting!" % bad)
                exit()
        else:
            proxies.append(None)

        crawl_level = args.level or 2  # Crawling level
        thread_count = args.threads or 2  # Number of threads
        only_urls = bool(args.only_urls)  # Only URLs mode is off by default

        # Variables we are gonna use later to store stuff
        keys = set()  # High entropy strings, prolly secret keys
        files = set()  # The pdf, css, png, etc files.
        intel = set(
        )  # The email addresses, website accounts, AWS buckets etc.
        robots = set()  # The entries of robots.txt
        custom = set()  # Strings extracted by custom regex pattern
        failed = set()  # URLs that photon failed to crawl
        scripts = set()  # The JavaScript files
        external = set(
        )  # URLs that don't belong to the target i.e. out-of-scope
        # URLs that have get params in them e.g. example.com/page.php?id=2
        fuzzable = set()
        endpoints = set()  # URLs found from javascript files
        processed = set(['dummy'])  # URLs that have been crawled
        # URLs that belong to the target i.e. in-scope
        internal = set(args.seeds)

        everything = []
        bad_scripts = set()  # Unclean javascript file urls
        bad_intel = set()  # needed for intel filtering

        core.config.verbose = verbose

        if headers:
            try:
                header_prompt = prompt()  # avoid shadowing the prompt() helper
            except FileNotFoundError as e:
                print('Could not load headers prompt: {}'.format(e))
                quit()
            headers = extract_headers(header_prompt)

        # If the user hasn't supplied the root URL with http(s), we will handle it
        if main_inp.startswith('http'):
            main_url = main_inp
        else:
            try:
                requests.get('https://' + main_inp,
                             proxies=random.choice(proxies))
                main_url = 'https://' + main_inp
            except:
                main_url = 'http://' + main_inp

        schema = main_url.split('//')[0]  # https: or http:?
        # Adding the root URL to internal for crawling
        internal.add(main_url)
        # Extracts host out of the URL
        host = urlparse(main_url).netloc

        output_dir = args.output or host
        output_dir = "results"

        try:
            domain = top_level(main_url)
        except:
            domain = host

        if args.user_agent:
            user_agents = args.user_agent.split(',')
        else:
            with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas:
                user_agents = [agent.strip('\n') for agent in uas]

        supress_regex = False

        # Records the time at which crawling started
        then = time.time()

        # Step 1. Extract urls from robots.txt & sitemap.xml
        zap(main_url, args.archive, domain, host, internal, robots, proxies)

        # This is so the level 1 emails are parsed as well
        internal = set(remove_regex(internal, args.exclude))

        # Step 2. Crawl recursively to the limit specified in "crawl_level"
        for level in range(crawl_level):
            # Links to crawl = (all links - already crawled links) - links not to crawl
            links = remove_regex(internal - processed, args.exclude)
            # If links to crawl are 0 i.e. all links have been crawled
            if not links:
                break
            # if crawled links are somehow more than all links. Possible? ;/
            elif len(internal) <= len(processed):
                if len(internal) > 2 + len(args.seeds):
                    break
            print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
            try:
                flash(self.extractor, links, thread_count)
            except KeyboardInterrupt:
                print('')
                break

        if not only_urls:
            for match in bad_scripts:
                if match.startswith(main_url):
                    scripts.add(match)
                elif match.startswith('/') and not match.startswith('//'):
                    scripts.add(main_url + match)
                elif not match.startswith('http') and not match.startswith(
                        '//'):
                    scripts.add(main_url + '/' + match)
            # Step 3. Scan the JavaScript files for endpoints
            print('%s Crawling %i JavaScript files' % (run, len(scripts)))
            flash(self.jscanner, scripts, thread_count)

            for url in internal:
                if '=' in url:
                    fuzzable.add(url)

            for match, intel_name, url in bad_intel:
                if isinstance(match, tuple):
                    for x in match:  # Because "match" is a tuple
                        if x != '':  # If the value isn't empty
                            if intel_name == "CREDIT_CARD":
                                if not luhn(match):
                                    # garbage number
                                    continue
                            intel.add("%s:%s" % (intel_name, x))
                else:
                    if intel_name == "CREDIT_CARD":
                        if not luhn(match):
                            # garbage number
                            continue
                    intel.add("%s:%s:%s" % (url, intel_name, match))
                for url in external:
                    try:
                        if top_level(url, fix_protocol=True) in INTELS:
                            intel.add(url)
                    except:
                        pass

        # Records the time at which crawling stopped
        now = time.time()
        # Finds total time taken
        diff = (now - then)
        minutes, seconds, time_per_request = timer(diff, processed)

        # Step 4. Save the results
        if not os.path.exists(output_dir):  # if the directory doesn't exist
            os.mkdir(output_dir)  # create a new directory

        datasets = [
            files, intel, robots, custom, failed, internal, scripts, external,
            fuzzable, endpoints, keys
        ]
        dataset_names = [
            'files', 'intel', 'robots', 'custom', 'failed', 'internal',
            'scripts', 'external', 'fuzzable', 'endpoints', 'keys'
        ]

        writer(datasets, dataset_names, output_dir)
        # Printing out results
        print(('%s-%s' % (red, end)) * 50)
        for dataset, dataset_name in zip(datasets, dataset_names):
            if dataset:
                print('%s %s: %s' %
                      (good, dataset_name.capitalize(), len(dataset)))
        print(('%s-%s' % (red, end)) * 50)

        print('%s Total requests made: %i' % (info, len(processed)))
        print('%s Total time taken: %i minutes %i seconds' %
              (info, minutes, seconds))
        print('%s Requests per second: %i' %
              (info, int(len(processed) / diff)))

        datasets = {
            'files': list(files),
            'intel': list(intel),
            'robots': list(robots),
            'custom': list(custom),
            'failed': list(failed),
            'internal': list(internal),
            'scripts': list(scripts),
            'external': list(external),
            'fuzzable': list(fuzzable),
            'endpoints': list(endpoints),
            'keys': list(keys)
        }

        if args.dns:
            print('%s Enumerating subdomains' % run)
            from plugins.find_subdomains import find_subdomains
            subdomains = find_subdomains(domain)
            print('%s %i subdomains found' % (info, len(subdomains)))
            writer([subdomains], ['subdomains'], output_dir)
            datasets['subdomains'] = subdomains
            from plugins.dnsdumpster import dnsdumpster
            print('%s Generating DNS map' % run)
            dnsdumpster(domain, output_dir)

        if args.export:
            from plugins.exporter import exporter
            # exporter(directory, format, datasets)
            results = datasets
            exporter(output_dir, args.export, datasets)

        print('%s Results saved in %s%s%s directory' %
              (good, green, output_dir, end))

        if args.std:
            for string in datasets[args.std]:
                sys.stdout.write(string + '\n')

        return results, 200
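The (results, 200) return value suggests this post() is served as an HTTP endpoint. A minimal wiring sketch under the assumption that Flask-RESTful is used; the class name, route, and stub body below are hypothetical, and the real body is the long method shown above.

from flask import Flask
from flask_restful import Api, Resource

app = Flask(__name__)
api = Api(app)

class PhotonScan(Resource):  # hypothetical wrapper; the real class is not shown
    def post(self, target, level_):
        # In the real code this is the post() body shown above;
        # a stub response keeps the sketch self-contained.
        return {'target': target, 'level': int(level_)}, 200

api.add_resource(PhotonScan, '/scan/<string:target>/<int:level_>')

if __name__ == '__main__':
    app.run()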
Example #9
                if not luhn(match):
                    # garbage number
                    continue
            intel.add("%s:%s:%s" % (url, intel_name, match))
        for url in external:
            try:
                if top_level(url, fix_protocol=True) in INTELS:
                    intel.add(url)
            except:
                pass

# Records the time at which crawling stopped
now = time.time()
# Finds total time taken
diff = (now - then)
minutes, seconds, time_per_request = timer(diff, processed)

# Step 4. Save the results
if not os.path.exists(output_dir): # if the directory doesn't exist
    os.mkdir(output_dir) # create a new directory

datasets = [files, intel, robots, custom, failed, internal, scripts,
            external, fuzzable, endpoints, keys]
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
                 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']

writer(datasets, dataset_names, output_dir)
# Printing out results
print(('%s-%s' % (red, end)) * 50)
for dataset, dataset_name in zip(datasets, dataset_names):
    if dataset:
        print('%s %s: %s' %
              (good, dataset_name.capitalize(), len(dataset)))
Example #10
result['urls']['internal'] = set(args.seeds)
load_modules('before-crawling')
parsed_url = urlparse(var['input_url'])
root_url = parsed_url.scheme + '://' + parsed_url.netloc
var['root_url'] = root_url
result['urls']['internal'].add(var['root_url'])
zap()  # parse sitemap.xml and robots.txt

try:
    crawler()
except KeyboardInterrupt:
    print('%s Crawler stopped' % info)

now = time.time()  # record ending time
diff = (now - then)  # total time taken
minutes, seconds, time_per_request = timer(diff, var['processed'])

result['data']['files'] = list(result['data']['files'])
result['urls']['failed'] = list(result['urls']['failed'])
result['data']['websites'] = list(result['data']['websites'])
result['urls']['internal'] = list(result['urls']['internal'])
result['urls']['external'] = list(result['urls']['external'])
result['urls']['fuzzable'] = list(result['urls']['fuzzable'])

if 'token' in result['data']['extractors']:
    valid_tokens = {}
    for token, url in result['data']['extractors']['token'].items():
        is_valid = is_token(token)
        if is_valid:
            valid_tokens[token] = url
    result['data']['extractors']['token'] = valid_tokens