def build_lsh(self, threshold=0.5):
    start = time.time()
    print('Building LSH...')
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    with lsh.insertion_session() as session:
        for i, entity in enumerate(self.entities):
            session.insert(i, self.minhash(entity.value))
    print('[{} s]'.format(time.time() - start))
    return lsh
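# A minimal, self-contained sketch of the insertion_session pattern used in
# build_lsh above, assuming only the datasketch library; the helper name and
# sample data are illustrative, not part of the original class.
from datasketch import MinHash, MinHashLSH

def _demo_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for token in text.split():
        m.update(token.encode('utf-8'))
    return m

_values = ["the quick brown fox jumps",
           "the quick brown fox leaps",
           "lorem ipsum dolor"]
_lsh = MinHashLSH(threshold=0.5, num_perm=128)
with _lsh.insertion_session() as session:
    for i, v in enumerate(_values):
        session.insert(i, _demo_minhash(v))
# Docs 0 and 1 share 4 of 6 tokens (Jaccard ~0.67 > 0.5), so both should be
# returned with high probability; doc 2 should not.
print(_lsh.query(_demo_minhash("the quick brown fox jumps")))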
def main() -> None:
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS), desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            file.append(rand_string)
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
        'type': 'cassandra',
        'basename': b'perftest',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': config.KEY_SPACE,
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }
    })

    # Single-pass loop kept only for the tqdm progress label.
    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    f_disc_mhs = open('minhashes.txt', 'w+')
    for minhash in tqdm(minhashes, desc="Log minHashes:"):
        log(f_disc_mhs, minhash[0], minhash[1].digest())
    f_disc_mhs.close()

    f_disc_files = open('files.txt', 'w+')
    for iterator in tqdm(range(len(files)), desc="Log files:"):
        log(f_disc_files, minhashes[iterator][0], files[iterator])
    f_disc_files.close()  # was f_disc_mhs.close(), which closed the wrong handle
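# Hedged follow-up sketch: re-attaching to the Cassandra-backed index built in
# main() from a separate process. Reusing the same basename, num_perm, and
# threshold is assumed to reconnect to the existing tables rather than start
# fresh (hence drop_keyspace/drop_tables stay False). The function name is
# illustrative; config mirrors the module above.
from datasketch import MinHash, MinHashLSH

def query_existing_index(query_strings):
    lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
        'type': 'cassandra',
        'basename': b'perftest',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': config.KEY_SPACE,
            'replication': {'class': 'SimpleStrategy',
                            'replication_factor': '1'},
            'drop_keyspace': False,
            'drop_tables': False,
        },
    })
    m = MinHash(num_perm=256)
    m.update_batch([s.encode('utf-8') for s in query_strings])
    return lsh.query(m)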
def similarity_threshold_bulk(self, df_library, df_query,
                              only_positive=False, return_df=False):
    """
    Takes a dataframe of 'library' strings to query against and a dataframe
    of query strings, and gives each row a unique ID. Transforms both the
    library and the query strings into MinHash objects.

    If return_df is True, df_query is returned with a column showing how
    many similar utterances were found in df_library.

    TODO: maybe use redis in production
    """
    from datasketch import MinHashLSH

    lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
    data_library = self.dataframe_to_data_list(df_library, 'lib_')
    data_query = self.dataframe_to_data_list(df_query, 'query_')

    # Use an insertion session to build an LSH index over all the library
    # data, which can then be queried.
    with lsh.insertion_session() as session:
        for key, minhash in data_library:
            session.insert(key, minhash)

    # Bulk-query the data_query objects against the index.
    query_results = []
    df_query['no_similar'] = 0
    for key, minhash in data_query:
        query_result = lsh.query(minhash)
        query_result_length = len(query_result)
        if return_df:
            df_query.loc[key, 'no_similar'] = query_result_length
        elif only_positive:
            # only_positive only matters when not returning a dataframe
            if query_result_length > 0:
                query_results.append((key, query_result, query_result_length))
        else:
            query_results.append((key, query_result, query_result_length))

    if return_df:
        return df_query
    return query_results
def lsh_clustering(
    signatures: List[np.ndarray],
    threshold: float = 0.5,
    num_perm: int = 128,
):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    with lsh.insertion_session() as session:
        for key, minhash in enumerate(signatures):
            session.insert(f"id-{key}",
                           MinHash(num_perm=num_perm, hashvalues=minhash))

    neighbors: List[List[int]] = []
    for key, minhash in enumerate(signatures):
        result = lsh.query(MinHash(num_perm=num_perm, hashvalues=minhash))
        neighbors.append([int(x.split("-")[1]) for x in result])
    return neighbors
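# Illustrative call to lsh_clustering above: the signatures are raw MinHash
# hashvalues arrays; the helper name and token sets are made up for the
# example, not part of the original module.
from datasketch import MinHash

def _demo_signature(tokens, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(t.encode('utf-8'))
    return m.hashvalues

_sigs = [_demo_signature(["a", "b", "c", "d"]),
         _demo_signature(["a", "b", "c", "e"]),
         _demo_signature(["x", "y", "z", "w"])]
# Docs 0 and 1 share 3 of 5 tokens (Jaccard 0.6 > 0.5), so each is likely to
# list the other; every doc also matches itself.
print(lsh_clustering(_sigs))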
def mass_values_jaccard(cols1: List[Column], cols2: List[Column]):
    # Index cols1 in a Redis-backed LSH, then query each column of cols2
    # against it. Assumes each `col.values` holds a precomputed MinHash.
    lsh = MinHashLSH(
        threshold=0.2,
        num_perm=128,
        storage_config={
            "type": "redis",
            "redis": {
                "host": "localhost",
                "port": 6379
            }
        },
    )
    with lsh.insertion_session() as session:
        for idx, col in enumerate(cols1):
            session.insert(str(idx), col.values)
    # The original called lsh.query() with no argument and never used cols2;
    # query() requires a MinHash, so query once per column in cols2.
    return [lsh.query(col.values) for col in cols2]
def init_lshs(directory, type, threshold):
    """Initialize and calculate LSH for the document database

    Args:
        directory (str): the directory with source files
        type (str): type of ngrams to use ('char', 'word')
        threshold (float): Jaccard threshold value

    Returns:
        lsh: datasketch object
    """
    # Create a MinHashLSH index using Redis as the storage layer
    lsh = MinHashLSH(threshold=threshold, num_perm=128,
                     storage_config={'type': 'redis',
                                     'redis': {'host': 'localhost',
                                               'port': 6379,
                                               'db': 1},
                                     'name': 1})
    data_list = []
    for f in os.listdir(directory):
        minhash = MinHash(num_perm=128)
        if type == 'char':
            filename, text = utils.read_file(os.path.join(directory, f))
            print(filename)
            for d in nltk.ngrams(text, 3):
                minhash.update("".join(d).encode('utf-8'))
        elif type == 'word':
            filename, text = utils.tokenize_file(os.path.join(directory, f))
            print(filename)
            for d in nltk.ngrams(text, 3):
                minhash.update(" ".join(d).encode('utf-8'))
        data_list.append((filename, minhash))

    with lsh.insertion_session() as session:
        for key, minhash in data_list:
            session.insert(key, minhash)

    return lsh
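# Hedged usage sketch for init_lshs: query the Redis-backed index for
# documents similar to a new file. The directory and file names are
# illustrative, and the query-building code is an assumption modeled on the
# 'char' branch above.
lsh = init_lshs('corpus/', 'char', threshold=0.6)
query_mh = MinHash(num_perm=128)
_, text = utils.read_file('suspect.txt')
for d in nltk.ngrams(text, 3):
    query_mh.update("".join(d).encode('utf-8'))
print(lsh.query(query_mh))  # filenames of candidate near-duplicates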
def main(SAMPLE_SIZE, output_type):
    tweet_data = {}
    conn = connect_to_database()
    executor = conn.cursor()
    executor.execute("SELECT * FROM tweets")
    tweet_query = executor.fetchall()

    print("Getting tweets...")
    tweets = get_tweets(tweet_query, SAMPLE_SIZE)
    pool = cf.ProcessPoolExecutor()
    tweet_results = [pool.submit(get_words, tweet) for tweet in tweets]
    for tweet in cf.as_completed(tweet_results):
        tweet_data[tweet.result()["nameid"]] = tweet.result()
    pool.shutdown()

    word_counts = list(map(lambda d: d["word_counts"], tweet_data.values()))
    packages = []
    # Note: this slicing drops the tail when SAMPLE_SIZE is not divisible by 8.
    for i in range(8):
        packages.append(word_counts[i * (SAMPLE_SIZE // 8):(i + 1) * (SAMPLE_SIZE // 8)])
    package_pool = cf.ProcessPoolExecutor(max_workers=8)
    package_results = [
        package_pool.submit(sum, counts, collections.Counter())
        for counts in packages
    ]
    word_sums = [f.result() for f in cf.as_completed(package_results)]
    package_pool.shutdown()
    all_words_seen = sum(word_sums, collections.Counter())
    words_to_remove = build_people_and_find_words(tweets, all_words_seen)

    print("Removing extraneous...")
    tweets_to_remove = []
    for tweet in tqdm(tweet_data.values(), desc="tweets"):
        for word in words_to_remove:
            if word in tweet["word_counts"]:
                del tweet["word_counts"][word]
        tweet["square_sum"] = math.sqrt(
            sum(map((lambda x: x**2), tweet["word_counts"].values())))
        if tweet["square_sum"] == 0:
            tweets_to_remove.append(tweet["nameid"])
    for nameid in tweets_to_remove:
        del tweet_data[nameid]
    tweets = tweet_data.values()

    print("Preliminary pairing...")
    prelim_data = list(
        map(lambda d: (d["nameid"], set_to_minhash(d["word_counts"])), tweets))
    prelim_similarities = MinHashLSH(threshold=LSH_LENIENCY, num_perm=128)  # .6
    with prelim_similarities.insertion_session() as session:
        for (key, minhash) in prelim_data:
            session.insert(key, minhash)

    pairs_to_check = {}
    for tweet in tqdm(tweets):
        pairs = [
            match for match in prelim_similarities.query(tweet["minHash"])
            if match != tweet["nameid"]
        ]
        if len(pairs) > 0:
            pairs_to_check[tweet["nameid"]] = pairs
            for pair in pairs:
                if pair not in pairs_to_check:
                    pairs_to_check[pair] = []
    tweets_to_remove = []
    for tweet in tweet_data:
        if tweet not in pairs_to_check:
            tweets_to_remove.append(tweet)
    for tweet in tweets_to_remove:
        del tweet_data[tweet]

    print("Sanity Checks...")
    people = list(tweet_data.keys())
    p1 = people[0]
    p2 = people[0]  # first check: distance of p1 to itself
    print(cos_dist(tweet_data[p1], tweet_data[p2]))
    p1_name = tweet_data[p1]["user"]["name"]
    for (nameid, tweet) in tweet_data.items():
        if tweet["user"]["name"] == p1_name and tweet["nameid"] != p1:
            print("found other tweet")
            p2 = nameid
            break
    print(cos_dist(tweet_data[p1], tweet_data[p2]))
    for (nameid, tweet) in tweet_data.items():
        if tweet["user"]["name"] != p1_name:
            print("found separate tweet")
            p2 = nameid
            break
    print(cos_dist(tweet_data[p1], tweet_data[p2]))

    print("Pairing...")
    distance_pool = cf.ProcessPoolExecutor(max_workers=8)
    future_results = []
    similarities = {}
    for (person, potentials) in tqdm(pairs_to_check.items(), desc="prep"):
        if person in tweet_data:
            tweet_data[person]["processed"] = True
            similarities[person] = {}
            for relation in potentials:
                if not tweet_data.get(relation, {"processed": True})["processed"]:
                    future_results.append(
                        distance_pool.submit(cos_dist, tweet_data[person],
                                             tweet_data[relation]))
    for comparison in tqdm(cf.as_completed(future_results), desc="futures"):
        result = comparison.result()
        similarities[result[0]][result[1]] = result[2]
    distance_pool.shutdown()

    # Snapshot items() before iterating: the loop adds new keys, and mutating
    # a dict while iterating it raises a RuntimeError in Python 3.
    for (person, comparisons) in list(similarities.items()):
        for (relation, weight) in comparisons.items():
            if relation not in similarities:
                similarities[relation] = {}
            if person not in similarities[relation]:
                similarities[relation][person] = weight

    print("Outputting...")
    if output_type == "csv":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
    elif output_type == "json":
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "csv+json":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
        print("Outputted to csv")
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "none":
        print("Did not write data.")
    print("Completed.")
def insertion_session_syncredis(lsh: MinHashLSH, data: list, buffer_size: int):
    # Buffered bulk insert; check_duplication=False skips per-key existence
    # checks, so callers must guarantee unique keys.
    with lsh.insertion_session(buffer_size=buffer_size) as session:
        for key, minhash in data:
            session.insert(key, minhash, check_duplication=False)
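# Sketch of driving insertion_session_syncredis with a Redis-backed index;
# the host/port and sample data are illustrative assumptions.
from datasketch import MinHash, MinHashLSH

lsh = MinHashLSH(threshold=0.5, num_perm=128,
                 storage_config={'type': 'redis',
                                 'redis': {'host': 'localhost', 'port': 6379}})
data = []
for i in range(1000):
    m = MinHash(num_perm=128)
    m.update(str(i).encode('utf-8'))
    data.append((f'doc-{i}', m))
# Larger buffers mean fewer round-trips to Redis; keys here are unique, as
# the helper's check_duplication=False requires.
insertion_session_syncredis(lsh, data, buffer_size=500)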
def main(corpus: str, mode: str, lsh_file: str, minhash_file: str,
         num_perm: int, shingles: int, threshold: float, n_jobs: int,
         output_dir: str):
    if mode != 'query':
        assert not lsh_file
    if mode == 'minhash-only':
        assert not minhash_file

    print("Making output dir:", output_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir()

    if minhash_file:
        print('Loading MinHashes from disk:', minhash_file)
        start = time.time()
        with open(minhash_file, 'rb') as f:
            cached_minhashes = pickle.load(f)
        print("Done loading MinHashes, time elapsed (sec):", time.time() - start)
        corpus_len = len(cached_minhashes)
        minhash_iter = starmap(lambda k, v: (k, v, None), cached_minhashes.items())
        minhashes = None  # Set to None to disable saving minhashes again
    else:
        if corpus == 'webtext':
            corpus_len = 8_282_020
            corpus_iter = make_corpus_iter(DATA_DIR / 'webtext_detokenized')
        elif corpus == 'openwebtext':
            corpus_len = 8_013_769
            corpus_iter = make_corpus_iter(DATA_DIR / 'openwebtext_shards')
        else:
            raise RuntimeError
        print("Using", n_jobs, "processes for MinHashing")
        minhashes = {}
        minhash_iter = parallel_create_minhashes(corpus_iter,
                                                 shingles=shingles,
                                                 num_perm=num_perm,
                                                 n_jobs=n_jobs)

    print("Starting...")
    if mode == 'lsh' or mode == 'lsh-ensemble':
        if mode == 'lsh':
            lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
            with lsh.insertion_session() as session:
                for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                               desc='Making MinHashLSH'):
                    # `is not None` rather than truthiness: a fresh, empty
                    # dict is falsy, so `if minhashes:` would never populate it.
                    if minhashes is not None:
                        minhashes[key] = minhash
                    session.insert(key, minhash,
                                   check_duplication=False)  # All keys are unique doc ids
        else:
            assert mode == 'lsh-ensemble'
            lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=num_perm,
                                     num_part=16)  # TODO: try 32
            lsh.index(
                tqdm(minhash_iter, total=corpus_len,
                     desc='Making MinHashLSHEnsemble'))

        # Save LSH
        print("Saving LSH...")
        start = time.time()
        with open(output_dir / 'lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f)
        print("Done saving LSH, time elapsed (sec):", time.time() - start)
    elif mode == 'query':
        print('Loading LSH:', lsh_file)
        start = time.time()
        with open(lsh_file, 'rb') as f:
            lsh = pickle.load(f)
            assert isinstance(lsh, MinHashLSH) and lsh.h == num_perm
        print("Done loading LSH, time elapsed (sec):", time.time() - start)

        duplicates_file = output_dir / 'duplicates.jsonl'
        print("Writing duplicates to", duplicates_file)
        with open(duplicates_file, 'a') as f:
            for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                           desc='Querying MinHashLSH'):
                if minhashes is not None:
                    minhashes[key] = minhash
                duplicates = lsh.query(minhash)
                if duplicates:
                    json.dump({key: duplicates}, f)
                    f.write('\n')
    elif mode == 'minhash-only':
        assert minhashes is not None
        for key, minhash, size in tqdm(minhash_iter, total=corpus_len,
                                       desc='MinHashing'):
            minhashes[key] = minhash
    else:
        raise RuntimeError

    # Save MinHashes
    if not minhash_file:
        print("Saving MinHashes...")
        start = time.time()
        with open(output_dir / 'minhashes.pkl', 'wb') as f:
            pickle.dump(minhashes, f)
        print("Done saving MinHashes, time elapsed (sec):", time.time() - start)
class StreamData:
    """ Stream in data sequentially. Uses pandas dataframes for intra-batch
    deduping and optional lsh_hash for historical deduping (sublinear
    complexity) """

    def __init__(self, filename, chunk=250, min_len=25,
                 clean_fnc=clean_string, lsh_hash=True, use_column=None):
        self.__dict__.update(locals())
        self.generator = pd.read_csv(filename, chunksize=chunk)
        self.n_processed = 0
        if self.lsh_hash is True:
            self.lsh_hash = MinHashLSH(threshold=0.995, num_perm=128)

    def __call__(self):
        """ Get a batch from the generator """
        return self._process(self.stream())

    def stream(self):
        """ Iterate generator """
        return next(self.generator)

    def _init_data(self, num_chunks):
        """ Generate a bunch of data to serve as initialization """
        return pd.concat([self.__call__() for _ in range(num_chunks)])

    def _process(self, batch):
        """ If use_column is specified, use it to make a new column of
        processed text data and remove rows whose processed text is shorter
        than min_len words. From the resulting dataframe, remove duplicates. """
        if self.use_column is not None:
            batch = batch.assign(processed=self._clean(batch[self.use_column]))
            batch = batch[[
                len(s.split()) > self.min_len for s in batch.processed
            ]]
        deduped = self._dedupe(batch)
        self.n_processed += len(deduped)
        return deduped

    def _clean(self, batch):
        """ Clean data using some function """
        if self.clean_fnc is not None:
            return [self.clean_fnc(sent) for sent in batch]
        return batch

    def _dedupe(self, dataframe):
        """ Delete duplicates of a dataframe. If use_column is specified,
        operate on the processed text. After deduping within the dataframe,
        if lsh_hash is enabled, also check that rows have not already been
        seen before. """
        if self.use_column is not None:
            deduped = dataframe.drop_duplicates(subset=['processed'])
            if isinstance(self.lsh_hash, MinHashLSH):
                # Pass the intra-batch deduped frame to the historical check
                # (the original passed the raw `dataframe`, silently
                # discarding the drop_duplicates result).
                deduped = self._hash(deduped)
        else:
            deduped = dataframe.drop_duplicates()
        deduped.index = range(self.n_processed, self.n_processed + len(deduped))
        return deduped

    def _hash(self, dataframe):
        """ Process dataframe to delete duplicates based on Jaccard
        similarity, then update the hash """
        # Convert current batch to hash table
        hash_batch = self._batch_to_hash(dataframe.processed, dataframe.index)
        # Greedy, locality sensitive query to see if each row is a duplicate
        kept_hashes, kept_idx = self._query_hash(hash_batch)
        # Keep only non-duplicates
        dataframe = dataframe[kept_idx]
        # Get new indexes for updates to keep things consistent in lsh dict
        indexes = range(self.n_processed, self.n_processed + len(dataframe))
        # Realign the kept hash update keys
        updates = [(i, h[1]) for i, h in zip(indexes, kept_hashes)]
        # Update the hash table
        self._update_hash(updates)
        return dataframe

    def _update_hash(self, hash_batch):
        """ After processing a batch, update lsh_hash with new entries """
        with self.lsh_hash.insertion_session() as session:
            for idx, hasher in hash_batch:
                session.insert(idx, hasher)

    def _query_hash(self, hash_batch):
        """ Query lsh_hash and ignore entries that have already been seen """
        keep_hashes, keep_idx = [], []
        for hasher in hash_batch:
            if not self.lsh_hash.query(hasher[1]):
                keep_hashes.append(hasher)
                keep_idx.append(True)
            else:
                keep_idx.append(False)
        return keep_hashes, keep_idx

    def _batch_to_hash(self, batch, indexes):
        """ Convert a list of strings to a list of tuples (index, hash object) """
        return [(idx, self._str_to_hash(string))
                for idx, string in zip(indexes, batch)]

    def _str_to_hash(self, string):
        """ Convert string to locality sensitive min-hash """
        data = set(string.split())
        hasher = MinHash(num_perm=128)
        for d in data:
            hasher.update(d.encode('utf-8'))
        return hasher
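# Hedged usage sketch for StreamData: stream a CSV in chunks and dedupe both
# within each batch and against everything seen so far via the LSH index.
# The filename and column name are illustrative assumptions.
stream = StreamData('comments.csv', chunk=250, use_column='text')
batch = stream()             # one cleaned, deduped chunk as a DataFrame
seed = stream._init_data(4)  # concatenate four deduped chunks for initialization
print(len(batch), stream.n_processed)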
print(f'{time.time() - t1} secs was taken to initiate\n')
print("Starting minhash + shingle creation....")
with mp.Pool() as p:
    MAX_COUNT = len(k)
    for res in tqdm(p.imap(minhash_operation, params), total=MAX_COUNT):
        pass
print("Completed creating minhash\nIndexing documents complete")

t2 = time.time()
lsh = MinHashLSH(threshold=0.50, num_perm=NUM_PERMUTATION, weights=(0.5, 0.5))
with lsh.insertion_session() as session:
    for key in tqdm(Dict.keys(), desc="LSH processing"):
        session.insert(key=key, minhash=Dict[key])

# Use a few of the indexed documents as queries.
query = ['/OneDoc/120.txt', '/OneDoc/123.txt', '/OneDoc/117.txt']
print(query)
query = [Dict[i] for i in query]
print(f"{time.time() - t2} secs was taken to create LSH")

print("\nfinding candidate pairs.....")
res = similarities(query, lsh)
pprint(res)
with open('result.csv', 'w') as f:
    for key in res.keys():
        f.write("%s,%s\n" % (key, res[key]))
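# The `similarities` helper called above is defined elsewhere; this is a
# plausible stand-in (an assumption, not the original code): query each
# MinHash against the index and collect the candidate keys per query.
def similarities(query_minhashes, lsh):
    return {i: lsh.query(m) for i, m in enumerate(query_minhashes)}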
def main(output_type):  # SAMPLE_SIZE, output_type):
    print("Getting tweets...")
    tweets = get_tweets()  # SAMPLE_SIZE)
    SAMPLE_SIZE = len(tweets)
    pool = cf.ProcessPoolExecutor()
    tweet_results = [
        pool.submit(get_words, name, tweet)
        for (name, tweet) in tweets.items()
    ]
    tweet_data = {
        tweet.result()["nameid"]: tweet.result()
        for tweet in cf.as_completed(tweet_results)
    }
    del tweets

    print("Summing Counts...")
    word_counts = list(map(lambda d: d["word_counts"], tweet_data.values()))
    packages = []
    # Note: this slicing drops the tail when SAMPLE_SIZE is not divisible by 8.
    for i in tqdm(range(8), desc="breaking_up"):
        packages.append(word_counts[i * (SAMPLE_SIZE // 8):(i + 1) * (SAMPLE_SIZE // 8)])
    package_pool = cf.ProcessPoolExecutor(max_workers=8)
    package_results = [
        package_pool.submit(sum, counts, collections.Counter())
        for counts in packages
    ]
    word_sums = [f.result() for f in cf.as_completed(package_results)]
    all_words_seen = sum(word_sums, collections.Counter())
    words_to_remove = build_people_and_find_words(SAMPLE_SIZE, all_words_seen)
    for word in words_to_remove:
        del all_words_seen[word]

    print("Removing extraneous and square summing...")
    tweets_to_remove = []
    for tweet in tqdm(tweet_data.values(), desc="tweet loop"):
        for word in tqdm(words_to_remove, desc="word loop"):
            if word in tweet["word_counts"]:
                del tweet["word_counts"][word]
        if len(tweet["word_counts"].keys()) == 0:
            tweets_to_remove.append(tweet["nameid"])
    # removal_pool = cf.ProcessPoolExecutor(max_workers=8)
    # removal_results = [removal_pool.submit(remove_extraneous, tweet, words_to_remove)
    #                    for tweet in tweet_data.values()]
    # tweet_data = {}
    # for finished in cf.as_completed(removal_results):
    #     result = finished.result()
    #     if result[0]:
    #         tweet_data[result[1]] = result[2]
    for nameid in tweets_to_remove:
        del tweet_data[nameid]

    # sample_size = len(tweet_data)
    # idf = sum(map(lambda d: collections.Counter(d["word_counts"].keys()),
    #               tweet_data.values()), collections.Counter())
    # for key in idf:
    #     idf[key] = sample_size / idf[key]
    for tweet in tqdm(tweet_data.values(), desc="square sum"):
        # for word in tweet["word_counts"]:
        #     tweet["word_counts"][word] = tweet["word_counts"][word] * idf[word]
        tweet["square_sum"] = math.sqrt(
            sum(map((lambda x: x**2), tweet["word_counts"].values())))

    print("Preliminary pairing...")
    prelim_data = list(
        map(lambda d: (d["nameid"], set_to_minhash(d["word_counts"])),
            tweet_data.values()))
    prelim_similarities = MinHashLSH(threshold=0.3, num_perm=128)
    with prelim_similarities.insertion_session() as session:
        for (key, minhash) in prelim_data:
            session.insert(key, minhash)

    pairs_to_check = {}
    for tweet in tqdm(tweet_data.values()):
        pairs = [
            match for match in prelim_similarities.query(tweet["minHash"])
            if match != tweet["nameid"]
        ]
        if len(pairs) > 0:
            pairs_to_check[tweet["nameid"]] = pairs
            for pair in pairs:
                if pair not in pairs_to_check:
                    pairs_to_check[pair] = []

    print("Pairing...")
    distance_pool = cf.ProcessPoolExecutor(max_workers=8)
    future_results = []
    similarities = {}
    # for person in tweet_data:
    #     similarities[person] = {}
    #     for relation in tweet_data:
    #         if person != relation:
    #             future_results.append(distance_pool.submit(cos_dist, tweet_data[person],
    #                                                        tweet_data[relation]))
    for (person, potentials) in tqdm(pairs_to_check.items(), desc="person"):
        if person in tweet_data:
            tweet_data[person]["processed"] = True
            similarities[person] = {}
            for relation in tqdm(potentials, desc="submitting"):
                if not tweet_data.get(relation, {"processed": True})["processed"]:
                    future_results.append(
                        distance_pool.submit(cos_dist, tweet_data[person],
                                             tweet_data[relation]))
    for comparison in cf.as_completed(future_results):
        result = comparison.result()
        similarities[result[0]][result[1]] = result[2]
    # for (person, comparisons) in similarities.items():
    #     for (relation, weight) in comparisons.items():
    #         if relation not in similarities:
    #             similarities[relation] = {}
    #         if person not in similarities[relation]:
    #             similarities[relation][person] = weight

    print("Outputting...")
    if output_type == "csv":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
    elif output_type == "json":
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "csv+json":
        similarity_frame = pd.DataFrame(similarities)
        similarity_frame.to_csv("./similarity_matrix.csv", na_rep=1)
        output_to_json("./writeTest.json", similarities, tweet_data)
    elif output_type == "none":
        print("Did not write data.")
    print("Completed.")