Example #1
def count_words(index, counters):
    try:
        counter = Counter()
        unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
        file_size = os.path.getsize(unprepared_txt)
        block_size = math.ceil(file_size / ARGS.workers)
        start = index * block_size
        end = min(file_size, start + block_size)
        with open(unprepared_txt, 'rb', buffering=ARGS.block_size) as unprepared_file, \
                open(get_partial_path(index), 'w', buffering=ARGS.block_size) as partial_file:
            pos = old_pos = start
            unprepared_file.seek(start)
            first = True
            while pos < end:
                line = unprepared_file.readline()
                pos = unprepared_file.tell()
                # Every worker but the first skips its first (possibly partial) line,
                # since the previous worker reads past its block end to finish that line.
                if index > 0 and first:
                    first = False
                    continue
                try:
                    line = line.decode()
                except UnicodeDecodeError:
                    continue
                lines = LANG.clean(line)
                for line in lines:
                    for word in line.split():
                        counter[word] += 1
                    partial_file.write(line + '\n')
                # Flush the counter (with the number of bytes processed since the last
                # flush) whenever it exceeds the target vocabulary size or the block ends.
                if len(counter) > ARGS.vocabulary_size or pos >= end:
                    counters.put((counter, pos - old_pos))
                    old_pos = pos
                    counter = Counter()
    except Exception as ex:
        announce('Shard worker {}: Error - {}'.format(index, ex))
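A minimal, self-contained sketch of the byte-offset sharding scheme count_words relies on, using a made-up in-memory corpus for illustration: each worker seeks to its block start and, except for the first worker, discards its first (possibly partial) line, because the previous worker reads past its own end boundary to finish that line.

import io
import math

data = b"alpha beta\ngamma\ndelta epsilon\nzeta\n"  # made-up corpus
workers = 2
block_size = math.ceil(len(data) / workers)

for index in range(workers):
    start = index * block_size
    end = min(len(data), start + block_size)
    shard = io.BytesIO(data)
    shard.seek(start)
    pos = start
    first = True
    while pos < end:
        line = shard.readline()
        pos = shard.tell()
        if index > 0 and first:
            # Skip the partial first line; the previous worker already read it.
            first = False
            continue
        print(index, line.decode().strip())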
Example #2
    def _gen_ln_sequential_patterns(self, prefix, l, projected_db):
        """Generates l>1 sequential patterns."""
        announce()
        candidates = Candidates(self._minsup)

        # Obtain the sequences that extend the prefix by one item
        for row in projected_db:
            # Find the indices where the prefix occurs in the row
            indices = [
                i for i in range(len(row)) if row[i:i + len(prefix)] == prefix
            ]
            # Check whether the prefix was found
            if len(indices) > 0:
                # Get the indices that mark the start and end of the first sequence extending the prefix
                start = indices[0]
                end = start + len(prefix) + 1
                # Confirm that the prefix isn't at the end of the row.
                if end <= len(row):
                    # Obtain the sequence
                    sequence = row[start:end]
                    candidate = Candidate(sequence)
                    candidates.add(candidate)

        sequential_patterns = candidates.get_frequent()
        sequences = self._sequential_patterns.register_patterns(
            l, sequential_patterns)

        leaving()
        return sequences
Example #3
 def add_pattern(self, l, pattern):
     """Adds a single l-sequence pattern to the collection"""
     announce()
     self.count += 1
     if l in self.sequential_patterns:
         self.sequential_patterns[l].append(pattern)
     else:
         self.sequential_patterns[l] = [pattern]
     leaving()
Example #4
    def _scan_projected_db(self, prefix, l, projected_db):
        """Scans projected databases for frequent sequences"""
        announce()
        if l == 0:
            sequences = self._gen_l1_sequential_patterns(
                prefix, l, projected_db)
        else:
            sequences = self._gen_ln_sequential_patterns(
                prefix, l, projected_db)

        leaving()
        return sequences
Example #5
    def prefix_span(self, prefix, l, projected_db):
        """Recursively mines sequential patterns by extending the prefix."""
        announce()

        sequences = self._scan_projected_db(prefix, l, projected_db)
        if len(sequences) > 0:
            # For each sequential pattern, append it to the prefix and mine its projected database.
            for item in sequences:
                # Project from the current database, not from the projection
                # built for the previous item.
                item_projected_db = ProjectedDB(
                    prefix=item,
                    projected_db=projected_db).create_projections()
                self.prefix_span(item, l + 1, item_projected_db)
        leaving()
Example #6
 def _gen_l1_sequential_patterns(self, prefix, l, projected_db):
     """Creates l1 frequent sequences as list of dictionaries."""
     announce()
     # Extract l1 candidates with support
     candidates = collections.Counter(
         itertools.chain(*map(set, projected_db)))
     # Obtain l1 sequences with minimum support
     sequential_patterns = [{
         "sequence": [item],
         "support": support
     } for item, support in candidates.items() if support >= self._minsup]
     # Register the sequential patterns and return the registered sequences
     sequences = self._sequential_patterns.register_patterns(
         l, sequential_patterns)
     leaving()
     return sequences
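For illustration, a self-contained run of the l1 support counting used above, with a made-up database and minsup = 2; converting each row to a set means an item is counted at most once per row.

import collections
import itertools

projected_db = [["a", "b", "c"], ["a", "c"], ["b", "d"]]  # made-up database
minsup = 2
candidates = collections.Counter(itertools.chain(*map(set, projected_db)))
# candidates == Counter({'a': 2, 'b': 2, 'c': 2, 'd': 1})
sequential_patterns = [{"sequence": [item], "support": support}
                       for item, support in candidates.items()
                       if support >= minsup]
print(sequential_patterns)  # 'd' is dropped: its support is below minsup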
Example #7
 def create_projections(self):
     """Generates the projections from the designated prefix."""
     announce()
     for row in self._projected_db:
         # Find the indices where the prefix occurs in the row
         indices = [
             i for i in range(len(row))
             if row[i:i + len(self.prefix)] == self.prefix
         ]
         # Check whether the prefix was found in this row
         if len(indices) > 0:
             # Get index that marks the start of the first occurrence of the prefix
             index = indices[0]
             # The projection (including the prefix) goes to the end of the row.
             projection = row[index:]
             self.projection.append(projection)
     leaving()
     return self.projection
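A small standalone sketch of the projection step performed by create_projections, using made-up rows: for each row, the first occurrence of the prefix is located and everything from that position to the end of the row is kept.

prefix = ["a", "b"]
projected_db = [["x", "a", "b", "c"], ["a", "b"], ["c", "d"]]  # made-up rows
projections = []
for row in projected_db:
    # Indices where the prefix occurs in the row
    indices = [i for i in range(len(row)) if row[i:i + len(prefix)] == prefix]
    if indices:
        # Keep the suffix starting at the first occurrence, prefix included
        projections.append(row[indices[0]:])
print(projections)  # [['a', 'b', 'c'], ['a', 'b']]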
Example #8
    people_menu = WebNavMenu("People search tools",
                             " - extract OSINT from multiple databases",
                             people_tools)
    social_menu = WebNavMenu(
        "Social media tools",
        " - collect OSINT from various social media platforms",
        social_tools,
    )
    paste_menu = WebNavMenu("Paste site tools",
                            "  - collect OSINT from multiple paste sites",
                            paste_tools)
    dark_menu = WebNavMenu("Dark web tools",
                           " - collect OSINT from dark web sources",
                           dark_tools)

    # Create the main menu from the sub menus
    main_menu = SuperMenu(
        "Main Menu",
        sub_menus=[
            recon_menu, people_menu, social_menu, paste_menu, dark_menu
        ],
    )
    main_menu.show_menu()


if __name__ == "__main__":
    banner()  # Display the banner
    announce(
        "New tools available in the Citadel: POCKINT, InstaLoader, CardPwn, Onioff"
    )
    main()
Example #9
def main():
    alphabet_txt = os.path.join(LANG.model_dir, 'alphabet.txt')
    raw_txt_gz = os.path.join(LANG.model_dir, 'raw.txt.gz')
    unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
    prepared_txt = os.path.join(LANG.model_dir, 'prepared.txt')
    vocabulary_txt = os.path.join(LANG.model_dir, 'vocabulary.txt')
    unfiltered_arpa = os.path.join(LANG.model_dir, 'unfiltered.arpa')
    filtered_arpa = os.path.join(LANG.model_dir, 'filtered.arpa')
    lm_binary = os.path.join(LANG.model_dir, 'lm.binary')
    kenlm_scorer = os.path.join(LANG.model_dir, 'kenlm.scorer')
    temp_prefix = os.path.join(LANG.model_dir, 'tmp')

    section('Writing alphabet file', empty_lines_before=1)
    with open(alphabet_txt, 'w', encoding='utf-8') as alphabet_file:
        alphabet_file.write('\n'.join(LANG.alphabet) + '\n')

    redo = ARGS.force_download

    section('Downloading text data')
    redo = maybe_download(LANG.text_url, raw_txt_gz, force=redo)

    section('Unzipping text data')
    redo = maybe_ungzip(raw_txt_gz, unprepared_txt, force=redo)

    redo = redo or ARGS.force_prepare

    section('Preparing text and building vocabulary')
    if redo or not os.path.isfile(prepared_txt) or not os.path.isfile(vocabulary_txt):
        redo = True
        announce('Preparing {} shards of "{}"...'.format(ARGS.workers, unprepared_txt))
        counters = Queue(ARGS.workers)
        source_bytes = os.path.getsize(unprepared_txt)
        aggregator_process = Process(target=aggregate_counters, args=(vocabulary_txt, source_bytes, counters))
        aggregator_process.start()
        counter_processes = list(map(lambda index: Process(target=count_words, args=(index, counters)),
                                     range(ARGS.workers)))
        try:
            for p in counter_processes:
                p.start()
            for p in counter_processes:
                p.join()
            counters.put(STOP_TOKEN)
            aggregator_process.join()
            print('')
            partials = list(map(lambda i: get_partial_path(i), range(ARGS.workers)))
            join_files(partials, prepared_txt)
            for partial in partials:
                os.unlink(partial)
        except KeyboardInterrupt:
            aggregator_process.terminate()
            for p in counter_processes:
                p.terminate()
            raise
    else:
        announce('Files "{}" and\n\t"{}" already exist - not preparing'.format(prepared_txt, vocabulary_txt))

    redo = redo or ARGS.force_generate

    section('Building unfiltered language model')
    if redo or not os.path.isfile(unfiltered_arpa):
        redo = True
        lmplz_args = [
            KENLM_BIN + '/lmplz',
            '--temp_prefix', temp_prefix,
            '--memory', '80%',
            '--discount_fallback',
            '--limit_vocab_file', vocabulary_txt,
            '--text', prepared_txt,
            '--arpa', unfiltered_arpa,
            '--skip', 'symbols',
            '--order', str(LANG.order)
        ]
        if len(LANG.prune) > 0:
            lmplz_args.append('--prune')
            lmplz_args.extend(list(map(str, LANG.prune)))
        subprocess.check_call(lmplz_args)
    else:
        announce('File "{}" already exists - not generating'.format(unfiltered_arpa))

    section('Filtering language model')
    if redo or not os.path.isfile(filtered_arpa):
        redo = True
        with open(vocabulary_txt, 'rb') as vocabulary_file:
            vocabulary_content = vocabulary_file.read()
        subprocess.run([
            KENLM_BIN + '/filter',
            'single',
            'model:' + unfiltered_arpa,
            filtered_arpa
        ], input=vocabulary_content, check=True)
    else:
        announce('File "{}" already exists - not filtering'.format(filtered_arpa))

    section('Generating binary representation')
    if redo or not os.path.isfile(lm_binary):
        redo = True
        subprocess.check_call([
            KENLM_BIN + '/build_binary',
            '-a', '255',
            '-q', '8',
            '-v',
            'trie',
            filtered_arpa,
            lm_binary
        ])
    else:
        announce('File "{}" already exists - not generating'.format(lm_binary))

    section('Building scorer')
    if redo or not os.path.isfile(kenlm_scorer):
        redo = True
        words = set()
        vocab_looks_char_based = True
        with open(vocabulary_txt) as vocabulary_file:
            for line in vocabulary_file:
                for word in line.split():
                    words.add(word.encode())
                    if len(word) > 1:
                        vocab_looks_char_based = False
        announce("{} unique words read from vocabulary file.".format(len(words)))
        announce(
            "{} like a character-based model.".format(
                "Looks" if vocab_looks_char_based else "Doesn't look"
            )
        )
        if ARGS.alphabet_mode == 'auto':
            use_utf8 = vocab_looks_char_based
        elif ARGS.alphabet_mode == 'utf8':
            use_utf8 = True
        else:
            use_utf8 = False
        serialized_alphabet = get_serialized_utf8_alphabet() if use_utf8 else LANG.get_serialized_alphabet()
        from ds_ctcdecoder import Scorer, Alphabet
        alphabet = Alphabet()
        err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
        if err != 0:
            announce('Error loading alphabet: {}'.format(err))
            sys.exit(1)
        scorer = Scorer()
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(LANG.alpha, LANG.beta)
        scorer.load_lm(lm_binary)
        scorer.fill_dictionary(list(words))
        shutil.copy(lm_binary, kenlm_scorer)
        scorer.save_dictionary(kenlm_scorer, True)  # append, not overwrite
        announce('Package created in {}'.format(kenlm_scorer))
        announce('Testing package...')
        scorer = Scorer()
        scorer.load_lm(kenlm_scorer)
    else:
        announce('File "{}" already exists - not building'.format(kenlm_scorer))
Example #10
    parser.add_argument('--alphabet-mode', choices=['auto', 'utf8', 'specific'], default='auto',
                        help='whether the alphabet should be derived from the vocabulary (auto), '
                             'cover all UTF-8 characters (utf8), '
                             'or be language specific (specific)')
    parser.add_argument('--force-download', action='store_true',
                        help='forces downloading, preparing and generating from scratch')
    parser.add_argument('--force-prepare', action='store_true',
                        help='forces preparing and generating from scratch (reusing available download)')
    parser.add_argument('--force-generate', action='store_true',
                        help='forces generating from scratch (reusing prepared data)')
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = parse_args()
    LANG = get_language(ARGS.language)
    if ARGS.order is not None:
        LANG.order = ARGS.order
    if ARGS.prune is not None:
        LANG.prune = list(map(int, ARGS.prune.split(':')))
    if ARGS.alpha is not None:
        LANG.alpha = ARGS.alpha
    if ARGS.beta is not None:
        LANG.beta = ARGS.beta
    ARGS.block_size = parse_file_size(ARGS.block_size)
    try:
        main()
    except KeyboardInterrupt:
        announce('\nInterrupted')
        sys.exit()