Example No. 1
 def test_diverse(self):
     hashes = [
         0x00000000, 0x10101000, 0x10100010, 0x10001010, 0x00101010,
         0x01010100, 0x01010001, 0x01000101, 0x00010101
     ]
     expected = [
         (0x00000000, 0x10101000),
         (0x00000000, 0x10100010),
         (0x00000000, 0x10001010),
         (0x00000000, 0x00101010),
         (0x00000000, 0x01010100),
         (0x00000000, 0x01010001),
         (0x00000000, 0x01000101),
         (0x00000000, 0x00010101),
         (0x00101010, 0x10001010),
         (0x00101010, 0x10100010),
         (0x00101010, 0x10101000),
         (0x10001010, 0x10100010),
         (0x10001010, 0x10101000),
         (0x10100010, 0x10101000),
         (0x00010101, 0x01000101),
         (0x00010101, 0x01010001),
         (0x00010101, 0x01010100),
         (0x01000101, 0x01010001),
         (0x01000101, 0x01010100),
         (0x01010001, 0x01010100)
     ]
     for blocks in range(4, 10):
         self.assertEqual(
             sorted(expected), sorted(simhash.find_all(hashes, blocks, 3)))
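
The same behaviour can be exercised outside a test harness; a minimal sketch, assuming the seomoz/simhash-py package is installed and imported as simhash (the hash values here are illustrative):

import simhash

# find_all(hashes, blocks, distance) returns the pairs of input hashes whose
# Hamming distance is at most `distance`; `blocks` controls how the 64-bit
# space is partitioned during the search and must be larger than `distance`.
hashes = [0x000000FF, 0x000000EF, 0x12345678]
pairs = simhash.find_all(hashes, 4, 3)
print(pairs)  # expected to include (0x000000EF, 0x000000FF), which differ in one bit
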
Example No. 2
    def query(self, distance=2, blocks='auto'):
        """ Find all the nearests neighbours for the dataset

        Parameters
        ----------
        distance : int, default=2
            Maximum number of differing bits in the simhash
        blocks : int or 'auto', default='auto'
            number of blocks into which the simhash is split
            when searching for duplicates,
            see https://github.com/seomoz/simhash-py

        Returns
        -------
        simhash : array
            the simhash value for all documents in the collection
        cluster_id : array
            the exact duplicates (documents with the same simhash)
            are grouped together by cluster_id
        dup_pairs : list
            list of tuples for the near-duplicates
        """
        from simhash import find_all

        if distance >= 64:
            raise ValueError(
                'Wrong input parameter for distance = {}. '
                'Must be less than 64!'.format(distance))

        _, cluster_id = np.unique(self._fit_shash, return_inverse=True)

        if blocks == 'auto':
            blocks = min(distance * 2, 64)
        matches = find_all(self._fit_shash, blocks, distance)
        return self._fit_shash, cluster_id, matches
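
The heart of query() can be reproduced standalone; a minimal sketch, assuming a plain list of 64-bit simhash values (the sample values and variable names are illustrative, not taken from the class above):

import numpy as np
from simhash import find_all

# Hypothetical simhash values for three documents; the last two are exact duplicates.
hashes = [0x000000FF, 0x0000FF00, 0x0000FF00]

distance = 2
blocks = min(distance * 2, 64)   # the same 'auto' heuristic used in query()

# Exact duplicates share a cluster_id; find_all reports near-duplicate pairs.
_, cluster_id = np.unique(hashes, return_inverse=True)
matches = find_all(hashes, blocks, distance)

print(cluster_id)   # e.g. [0 1 1]: the two identical hashes fall into one cluster
print(matches)      # pairs of hashes that differ in at most `distance` bits
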
Example No. 3
def prune(df):
    adj_list = collections.defaultdict(set)

    #Self matches
    identity_match_count = 0
    for sh, subdf in df.groupby(["simhash", "year_season"]):
        for i in subdf.index:
            for j in subdf.index:
                if i != j:
                    identity_match_count += 1
                    adj_list[i].add(j)

    #Cross matches
    for m1, m2 in simhash.find_all(df.simhash, 2, 1):
        for i in df[df.simhash == m1].index:
            for j in df[(df.simhash == m2)
                        & (df.year_season == df.loc[i].year_season)].index:
                adj_list[i].add(j)
                adj_list[j].add(i)

    rev_map, rep_map = bfs(adj_list)
    to_keep = set((i for i in rep_map
                   if i in rep_map[i])) | (set(df.index) - set(rev_map.keys()))

    df_pruned = df.loc[to_keep]

    logger.info(
        f"Removed {len(df)-len(df_pruned)} of {len(df)} documents due to similarity. Wanted to keep {len(to_keep)}"
    )

    return df_pruned
Example No. 4
def find_changed_policies():
    policies_cache = defaultdict(lambda: defaultdict(dict))
    for h, d, y, s in util.get_pool().map(
            fnd.hash_text,
        ((data["policy_text"], data["site_url"], data["year"], data["season"])
         for data, cols in util.load_all_policies())):
        policies_cache[d][y][s] = h

    changed_pols = defaultdict(lambda: defaultdict(lambda: 0))
    all_pols = defaultdict(lambda: defaultdict(lambda: 0))

    for dom in policies_cache:
        print(policies_cache[dom])
        prev_pol = None
        for y, s in util.iter_year_season():
            try:
                print(y, s)
                print(policies_cache[dom][y])
                pol = policies_cache[dom][y][s]
            except KeyError:
                continue
            if prev_pol is not None:
                if len(simhash.find_all([prev_pol, pol], 4, 3)) != 0:
                    changed_pols[y][s] += 1
                all_pols[y][s] += 1
    return ([changed_pols[y][s] for y, s in util.iter_year_season()],
            [all_pols[y][s] for y, s in util.iter_year_season()])
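
The two-element find_all call above acts as a pairwise near-duplicate test; a minimal equivalent sketch using num_differing_bits from the same library (the helper name is illustrative; num_differing_bits also appears in Example No. 7 below):

import simhash

def is_near_duplicate(prev_pol, pol, max_bits=3):
    # True when the two simhash values differ in at most max_bits bits,
    # the same condition that len(simhash.find_all([prev_pol, pol], 4, 3)) != 0 tests above.
    return simhash.num_differing_bits(prev_pol, pol) <= max_bits
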
Example No. 5
    def __find_matches(self, hashes, blocks, distance):
        start = time.time()
        m = simhash.find_all(hashes, blocks, distance)
        average_time = (time.time() - start) / len(hashes)

        for i in range(len(hashes)):
            self.find_time_dict.update({i: average_time})

        return m
Example No. 6
 def test_diverse(self):
     hashes = [
         0x00000000, 0x10101000, 0x10100010, 0x10001010, 0x00101010,
         0x01010100, 0x01010001, 0x01000101, 0x00010101
     ]
     expected = [(0x00000000, 0x10101000), (0x00000000, 0x10100010),
                 (0x00000000, 0x10001010), (0x00000000, 0x00101010),
                 (0x00000000, 0x01010100), (0x00000000, 0x01010001),
                 (0x00000000, 0x01000101), (0x00000000, 0x00010101),
                 (0x00101010, 0x10001010), (0x00101010, 0x10100010),
                 (0x00101010, 0x10101000), (0x10001010, 0x10100010),
                 (0x10001010, 0x10101000), (0x10100010, 0x10101000),
                 (0x00010101, 0x01000101), (0x00010101, 0x01010001),
                 (0x00010101, 0x01010100), (0x01000101, 0x01010001),
                 (0x01000101, 0x01010100), (0x01010001, 0x01010100)]
     for blocks in range(4, 10):
         self.assertEqual(sorted(expected),
                          sorted(simhash.find_all(hashes, blocks, 3)))
Example No. 7
def hammingCompare(outtweets, innerTwitter):
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")
    liteClient = retinasdk.LiteClient(apiKey.retina_token)
    res = []

    for index, outtweet in enumerate(outtweets):
        result = {}
        # get simHash
        simhash_pair = getSimHash(outtweet[2], innerTwitter, client)
        if len(simhash_pair) > 1:
            diff_bits = simhash.num_differing_bits(simhash_pair['out_hash'],
                                                   simhash_pair['in_hash'])
            hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
            blocks = 4  # Number of blocks to use
            distance = 3  # Number of bits that may differ in matching pairs
            matches = simhash.find_all(hashes, blocks, distance)
            res.append([index, outtweet[2], matches])
    return res
Example No. 8
 def test_basic(self):
     hashes = [
         0x000000FF, 0x000000EF, 0x000000EE, 0x000000CE, 0x00000033,
         0x0000FF00, 0x0000EF00, 0x0000EE00, 0x0000CE00, 0x00003300,
         0x00FF0000, 0x00EF0000, 0x00EE0000, 0x00CE0000, 0x00330000,
         0xFF000000, 0xEF000000, 0xEE000000, 0xCE000000, 0x33000000
     ]
     expected = [(0x000000EF, 0x000000FF), (0x000000EE, 0x000000EF),
                 (0x000000EE, 0x000000FF), (0x000000CE, 0x000000EE),
                 (0x000000CE, 0x000000EF), (0x000000CE, 0x000000FF),
                 (0x0000EF00, 0x0000FF00), (0x0000EE00, 0x0000EF00),
                 (0x0000EE00, 0x0000FF00), (0x0000CE00, 0x0000EE00),
                 (0x0000CE00, 0x0000EF00), (0x0000CE00, 0x0000FF00),
                 (0x00EF0000, 0x00FF0000), (0x00EE0000, 0x00EF0000),
                 (0x00EE0000, 0x00FF0000), (0x00CE0000, 0x00EE0000),
                 (0x00CE0000, 0x00EF0000), (0x00CE0000, 0x00FF0000),
                 (0xEF000000, 0xFF000000), (0xEE000000, 0xEF000000),
                 (0xEE000000, 0xFF000000), (0xCE000000, 0xEE000000),
                 (0xCE000000, 0xEF000000), (0xCE000000, 0xFF000000)]
     for blocks in range(4, 10):
         self.assertEqual(sorted(expected),
                          sorted(simhash.find_all(hashes, blocks, 3)))
Example No. 9
 def test_basic(self):
     hashes = [
         0x000000FF, 0x000000EF, 0x000000EE, 0x000000CE, 0x00000033,
         0x0000FF00, 0x0000EF00, 0x0000EE00, 0x0000CE00, 0x00003300,
         0x00FF0000, 0x00EF0000, 0x00EE0000, 0x00CE0000, 0x00330000,
         0xFF000000, 0xEF000000, 0xEE000000, 0xCE000000, 0x33000000
     ]
     expected = [
         (0x000000EF, 0x000000FF),
         (0x000000EE, 0x000000EF),
         (0x000000EE, 0x000000FF),
         (0x000000CE, 0x000000EE),
         (0x000000CE, 0x000000EF),
         (0x000000CE, 0x000000FF),
         (0x0000EF00, 0x0000FF00),
         (0x0000EE00, 0x0000EF00),
         (0x0000EE00, 0x0000FF00),
         (0x0000CE00, 0x0000EE00),
         (0x0000CE00, 0x0000EF00),
         (0x0000CE00, 0x0000FF00),
         (0x00EF0000, 0x00FF0000),
         (0x00EE0000, 0x00EF0000),
         (0x00EE0000, 0x00FF0000),
         (0x00CE0000, 0x00EE0000),
         (0x00CE0000, 0x00EF0000),
         (0x00CE0000, 0x00FF0000),
         (0xEF000000, 0xFF000000),
         (0xEE000000, 0xEF000000),
         (0xEE000000, 0xFF000000),
         (0xCE000000, 0xEE000000),
         (0xCE000000, 0xEF000000),
         (0xCE000000, 0xFF000000)
     ]
     for blocks in range(4, 10):
         self.assertEqual(
             sorted(expected), sorted(simhash.find_all(hashes, blocks, 3)))
Example No. 10
if args.random:
    if args.hashes:
        print('Random supplied with --hashes')
        exit(1)

    if not hashes:
        print('Generating %i hashes' % args.random)
        # randint is inclusive, so cap at (1 << 64) - 1 to stay within 64 bits
        hashes = [random.randint(0, (1 << 64) - 1) for i in range(args.random)]
elif not args.hashes:
    print('No hashes or queries supplied')
    exit(2)

class Timer(object):
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = -time.time()
        print('Starting %s' % self.name)
        return self

    def __exit__(self, t, v, tb):
        self.start += time.time()
        if t:
            print('  Failed %s in %fs' % (self.name, self.start))
        else:
            print('     Ran %s in %fs' % (self.name, self.start))

with Timer('Find all'):
    len(simhash.find_all(hashes, args.blocks, args.bits))
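
The snippet assumes an args namespace with random, hashes, blocks and bits attributes; a hypothetical argparse preamble that would satisfy it (flag names match the attributes used above, defaults are illustrative):

import argparse
import random
import time

import simhash

parser = argparse.ArgumentParser(description='Benchmark simhash.find_all')
parser.add_argument('--random', type=int, help='generate this many random hashes')
parser.add_argument('--hashes', help='path to a file of precomputed hashes')
parser.add_argument('--blocks', type=int, default=6, help='number of blocks to split each hash into')
parser.add_argument('--bits', type=int, default=3, help='maximum number of differing bits')
args = parser.parse_args()

hashes = []   # optionally populated from args.hashes before the checks above run
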
Example No. 11
File: dedup.py  Project: spikems/qa
def dedup_near(infile, outfile, b, k, debug=False):
    """
    """
    #
    removelist = []
    grplist = []

    #
    writer = open(outfile, 'w')
    reader = open(infile, 'rb')

    startColid = getStartColumn(infile)

    #
    duphash = {}  #hash -> set(lineid)

    #
    linecnt = 0
    data_h = []  #list of hash val
    index = {}  # hash val -> lineid
    data_v = {}  # lineid -> data
    for line in reader:
        #apos = line.find(' ')
        apos = getStartPos(line, startColid)

        if apos >= 0:
            hash = compute(line[apos:])
            data_h.append(hash)
            #here duplicate hash exist
            if hash in index:
                #add the same line into the same group
                #set grpid to the grpid of the last lineid with equal hash value
                if hash in duphash:
                    duphash[hash].append(linecnt)
                else:
                    #init with the first lineid
                    duphash[hash] = [index[hash]]
                    duphash[hash].append(linecnt)
            else:
                index[hash] = linecnt

            data_v[linecnt] = line
            #data_v[linecnt] = line[apos:]
        linecnt += 1

    #logger.info('lines=%s', '\n'.join([data_v[x] for x in range(5)]))
    #    logger.info('hash=%s', data_h[:5])
    if debug:
        with open('hash.txt', 'w') as hashf:
            for h in data_h:
                hashf.write('%s\n' % h)
        with open('hash_full.txt', 'w') as hashf:
            for idx in range(len(data_h)):
                hashf.write('%s %s' % (data_h[idx], data_v[idx]))

    # output the match group to .log
    grpwriter = open(outfile + '.log', 'w')
    for key in duphash.keys():
        ids = duphash[key]
        # only the first line in each group is kept
        removelist.extend(ids[1:])
        grplist.append(ids)

        grpwriter.write('ids:%s\n' % ' '.join([str(x) for x in ids]))
        #write the group of match
        for lineid in ids:
            grpwriter.write('%s' % data_v[lineid])

        grpwriter.write('==================\n')

    logger.info('duphash removecnt=%d, linecnt = %s', len(removelist), linecnt)
    #find all pairs of match
    matches = simhash.find_all(data_h, b, k)

    marks = {}  #lineid -> groupid
    grpindex = {}  # groupid -> [lineids]
    groupid = 0

    for A, B in matches:
        grpidA, grpidB = -1, -1
        if index[A] in marks:
            grpidA = marks[index[A]]
        if index[B] in marks:
            grpidB = marks[index[B]]
        if grpidA == -1 and grpidB == -1:
            #new pair
            marks[index[A]] = groupid
            marks[index[B]] = groupid
            grpindex[groupid] = set([index[A], index[B]])

            groupid += 1
        elif grpidA == -1:
            # add A to B's group
            marks[index[A]] = grpidB
            grpindex[grpidB].add(index[A])
        elif grpidB == -1:
            marks[index[B]] = grpidA
            grpindex[grpidA].add(index[B])
        else:
            #merge two old groups
            for lid in grpindex[grpidB]:
                marks[lid] = grpidA
                grpindex[grpidA].add(lid)
            grpindex[grpidB].clear()

    # output the groups
    #grpwriter = open(outfile + '.log', 'w')
    linecntx = 0
    for grp in grpindex.keys():
        if grpindex[grp]:
            ids = [lid for lid in grpindex[grp]]
            ids = sorted(ids, reverse=True)

            linecntx += len(ids[1:])
            #output the first one
            removelist.extend(ids[1:])
            grplist.append(ids)

            #output all
            grpwriter.write('ids:%s\n' % ids)
            #write the group of match
            for lineid in ids:
                grpwriter.write('%s' % data_v[lineid])

            grpwriter.write('==================\n')

    logger.info('total removecnt=%d, linecntx = %s, grpcnt=%d',
                len(removelist), linecntx, len(grpindex.keys()))

    # output final result
    remove = set(removelist)
    for lid in range(linecnt):
        if lid not in remove and lid in data_v:
            writer.write('%s' % data_v[lid])

    # output the grplist
    with open(outfile + '.grp', 'w') as grpf:
        for grp in grplist:
            if len(grp) > 1:
                grpf.write('%s\n' % ' '.join([str(x) for x in grp]))
            else:
                grpf.write('%s\n' % grp[0])

    reader.close()
    writer.close()
Example No. 12
def dedup_near(data, k, b):
    removelist = []
    grplist = []

    duphash = {}  #hash -> set(lineid)
    linecnt = 0
    data_h = []  #list of hash val
    index = {}  # hash val -> lineid
    data_v = {}  # lineid -> data

    for line in data:
        hash = compute(line)
        if hash in index:
            if hash in duphash:
                duphash[hash].append(linecnt)
            else:
                duphash[hash] = [
                    index[hash],
                ]
                duphash[hash].append(linecnt)
        else:
            index[hash] = linecnt
        data_v[linecnt] = line
        data_h.append(hash)
        linecnt += 1

    for key in duphash.keys():
        ids = duphash[key]
        removelist.extend(ids[1:])
        grplist.append(ids)
    logger.info('duphash removecnt=%d, linecnt = %s', len(removelist), linecnt)
    matches = simhash.find_all(data_h, b, k)
    marks = {}  # lineid -> groupid
    grpindex = {}  # groupid -> [lineids]
    groupid = 0
    for A, B in matches:
        grpidA, grpidB = -1, -1
        if index[A] in marks:
            grpidA = marks[index[A]]
        if index[B] in marks:
            grpidB = marks[index[B]]
        if grpidA == -1 and grpidB == -1:
            # new pair
            marks[index[A]] = groupid
            marks[index[B]] = groupid
            grpindex[groupid] = set([index[A], index[B]])

            groupid += 1
        elif grpidA == -1:
            # add A to B's group
            marks[index[A]] = grpidB
            grpindex[grpidB].add(index[A])
        elif grpidB == -1:
            marks[index[B]] = grpidA
            grpindex[grpidA].add(index[B])
        else:
            # merge two old groups
            for lid in grpindex[grpidB]:
                marks[lid] = grpidA
                grpindex[grpidA].add(lid)
            grpindex[grpidB].clear()

    linecntx = 0
    for grp in grpindex.keys():
        if grpindex[grp]:
            ids = [lid for lid in grpindex[grp]]
            ids = sorted(ids, reverse=True)

            linecntx += len(ids[1:])
            # output the first one
            removelist.extend(ids[1:])
            grplist.append(ids)

    logger.info('total removecnt=%d, linecntx = %s, grpcnt=%d',
                len(removelist), linecntx, len(grpindex.keys()))

    remain = []
    remove = set(removelist)
    for lid in range(linecnt):
        if lid not in remove and lid in data_v:
            remain.append(data_v[lid])

    with open('grp', 'w') as grpf:
        for grp in grplist:
            if len(grp) > 1:
                for id in grp:
                    grpf.write('%s\n' % (data_v[id].replace(" ", "")))
                grpf.write('###############\n')
    return remain
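
A minimal call sketch, assuming the module-level compute() and logger used above are available (the sample lines and parameter values are illustrative):

lines = [
    'the quick brown fox jumps over the lazy dog',
    'the quick brown fox jumped over the lazy dog',   # likely a near-duplicate of the first
    'an entirely different sentence about privacy policies',
]

# k = maximum differing bits, b = number of blocks passed to simhash.find_all
remaining = dedup_near(lines, k=3, b=6)
print(remaining)   # one representative kept per duplicate group
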
Example No. 13
            duphash[hash] = [index[hash]]
            duphash[hash].append(linecnt)
    else:
        index[hash] = linecnt
    data_v[linecnt] = line
    linecnt += 1

print('duphash', duphash)
for key in duphash.keys():
    ids = duphash[key]
    removelist.extend(ids[1:])
    grplist.append(ids)

b = 4
k = 3
matches = simhash.find_all(data_h, b, k)

marks = {}  # lineid -> groupid
grpindex = {}  # groupid -> [lineids]
groupid = 0

for A, B in matches:
    grpidA, grpidB = -1, -1
    if index[A] in marks:
        grpidA = marks[index[A]]
    if index[B] in marks:
        grpidB = marks[index[B]]
    if grpidA == -1 and grpidB == -1:
        marks[index[A]] = groupid
        marks[index[B]] = groupid
        grpindex[groupid] = set([index[A], index[B]])
Example No. 14
        print('Random supplied with --hashes')
        exit(1)

    if not hashes:
        print('Generating %i hashes' % args.random)
        # randint is inclusive, so cap at (1 << 64) - 1 to stay within 64 bits
        hashes = [random.randint(0, (1 << 64) - 1) for i in range(args.random)]
elif not args.hashes:
    print('No hashes or queries supplied')
    exit(2)


class Timer(object):
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = -time.time()
        print('Starting %s' % self.name)
        return self

    def __exit__(self, t, v, tb):
        self.start += time.time()
        if t:
            print('  Failed %s in %fs' % (self.name, self.start))
        else:
            print('     Ran %s in %fs' % (self.name, self.start))


with Timer('Find all'):
    len(simhash.find_all(hashes, args.blocks, args.bits))
Example No. 15
data = []
with Pool(4) as p:
    for d in p.imap_unordered(one, data_one):
        data.append(d)

data.sort(key=lambda t: t['date_parsed'])
by_hash = defaultdict(list)
by_root = defaultdict(list)
for i, d in enumerate(data):
    root = d['root_url']
    by_root[root].append(d)
    if hashing:
        by_hash.setdefault(d['text_simhash'], []).append(i)

if hashing:
    matching_hashes = simhash.find_all(by_hash.keys(), 8, 6)

matching_indices = {}

if hashing:
    for a, b in matching_hashes:
        for i in by_hash[a]:
            for j in by_hash[b]:
                matching_indices.setdefault(i, set()).add(j)
                matching_indices.setdefault(j, set()).add(i)

# start from the minimal i

follower = {}
following = {}
Example No. 16
def make_textsim_graph(filtername):
    try:
        print("making dirs: %s" % ("../data/text_sim/%s/" % filtername))
        os.makedirs("../data/text_sim/%s/" % filtername)
    except OSError:
        pass
    
    p = util.get_pool()
    args = []
    for data,cols in ioutils.load_all_policies(limit=-1, filtername=filtername):
        text=data["policy_text"]
        domain=data["site_url"]
        year=str(data["year"])
        season=data["season"]
        args.append((text, domain, int(year), season))
    print("Total docs is %d" % len(args))
            
    simhashes = {}
    all_hashes = []
    sentences = {}
    for h, sentence, domain, year, season in p.map(hash_text, args):
        if sentence not in sentences:
            sentId = len(sentences)
            sentences[sentence] = sentId
        else:
            sentId = sentences[sentence]

        if h not in simhashes:
            simhashes[h] = {}
        simhashes[h][(domain,year,season)] = sentId
        all_hashes.append(h)

    matches = simhash.find_all(all_hashes, SIMHASH_THRESH + 1, SIMHASH_THRESH)


    sentence_inv = {}
    for s in sorted(sentences, key=lambda x:sentences[x]):
        i = sentences[s]
        sentence_inv[i] = s
    del sentences


    lzma_filters = my_filters = [
        {
            "id": lzma.FILTER_LZMA2, 
            "preset": 9 | lzma.PRESET_EXTREME, 
            "dict_size": 100000, #~10k words in english speaker's vocab, x10 for good measure
            "lc": 3,
            "lp": 0,
            "pb": 0, # assume ascii
            "mode": lzma.MODE_NORMAL,
            "nice_len": 273,
            "mf": lzma.MF_BT4
        }
    ]
        
    adj = {}
    adj_sen = {}
    adj_sen_dom = {}
    self_match = [(h,h) for h in simhashes if len(simhashes[h]) > 1]

    if SAMPLE:
        dist_bins = [[] for i in range(10)]

    accepted = 0
    rejected = 0
    rejected_low_pass = 0
    for l,r in itertools.chain(matches, self_match):
        lpols = simhashes[l].keys()
        rpols = simhashes[r].keys()
        ldomains = set((dom for dom, _, _ in simhashes[l]))
        rdomains = set((dom for dom, _, _ in simhashes[r]))
        domains = ldomains.union(rdomains)
        if l == r or len(domains) > max(len(ldomains),len(rdomains)):
            for ld, ly, ls in lpols:
                for rd, ry, rs in rpols:
                    lt = ly * 10 + seasonToOrd[ls]
                    rt = ry * 10 + seasonToOrd[rs]
                    if lt == rt:
                        if CROSS_YEAR_ONLY:
                            continue
                        else:
                            first = "%d%s_%s" % (ly, ls, ld)
                            second = "%d%s_%s" % (ry, rs, rd)
                            pass
                    elif lt < rt:
                        first = "%d%s_%s" % (ly, ls, ld)
                        second = "%d%s_%s" % (ry, rs, rd)
                    else:
                        first = "%d%s_%s" % (ry, rs, rd)
                        second = "%d%s_%s" % (ly, ls, ld)
                    if first not in adj:
                        adj[first] = []
                    adj[first].append(second)

                    lId = simhashes[l][ld,ly,ls]
                    rId = simhashes[r][rd,ry,rs]
                    if FUZZ_THRESH == 0 and not USE_NCD:
                        #Anything will pass, no need to compute
                        comp_dist = 100
                    else:
                        comp_dist = check_distance(lId,rId,sentence_inv, lzma_filters)
                    if USE_NCD:
                        if len(sentence_inv[lId]) + len(sentence_inv[rId]) < 200:
                            comp_dist -= 0.3 #Magic offset because NCD doesn't work well on small text
                        if comp_dist > NCD_THRESH:
                            print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    else:
                        if SAMPLE:
                            if comp_dist != 100 and comp_dist >= 90:
                                dist_bins[100 - (comp_dist + 1)].append((lId,rId,comp_dist))
                        if comp_dist < 90:
                            rejected_low_pass += 1
                        if comp_dist < FUZZ_THRESH:
#                            print("Ruled out %s x %s (%f, %s, %s)" % (ld, rd, comp_dist, hex(l), hex(r)))
                            rejected += 1
                            continue
                    accepted += 1

                    if lId not in adj_sen_dom:
                        adj_sen_dom[lId] = set()
                    adj_sen_dom[lId].add(first)
                    adj_sen_dom[lId].add(second)

                    if lId not in adj_sen:
                        adj_sen[lId] = set()
                    adj_sen[lId].add(rId)
                    if rId not in adj_sen:
                        adj_sen[rId] = set()
                    adj_sen[rId].add(lId)

    adj_rev, adj_rep = bfs(adj)

    print("Accepted: %d, rejected: %d, low pass: %d" % (accepted, rejected, rejected_low_pass))

    if SAMPLE:
        for i in range(len(dist_bins)):
            with open("../data/text_sim/sample_0_%d.txt" % i, "w+") as f:
                if len(dist_bins[i]) <= 50:
                    sample = dist_bins[i]
                else:
                    sample = random.sample(dist_bins[i],10)
                for lId,rId,comp_dist in sample:
                    #print(lId,rId,comp_dist)
                    s1 = sentence_inv[lId]
                    s2 = sentence_inv[rId]
                    with open("../data/text_sim/s1_tmp.txt","w+") as f1: f1.write(s1)
                    with open("../data/text_sim/s2_tmp.txt","w+") as f1: f1.write(s2)
                    try:
                        diff = subprocess.check_output("echo \"diff -y <(fold -s -w72 ../data/text_sim/s1_tmp.txt) <(fold -s -w72 ../data/text_sim/s2_tmp.txt) -W 200; exit 0\" | bash", shell=True)
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 2:
                            print(e.output)
                            sys.exit(1)
                        diff = e.output
                    diff = diff.decode()
                    f.write("%s\n%d\n%s\n" % ("="*40,comp_dist,"-"*40))
                    f.write("%s\n" % (diff))

    with open("../data/text_sim/%s/policy_links.json" % filtername, "w+") as f:
        write_obj = []
        i = 0
        for s in adj_rep:
            l = [dom[6:] for dom in adj_rep[s]]
            write_obj.append({"id": i, "domains": l})
            i += 1
        json.dump(write_obj, f)
Example No. 17
    #df.swifter.progress_bar(enable=True)
    df["policy_text"] = clean_text(df.policy_text)

    save_data(prune(df))
    return df


if __name__ == "__main__":
    df = main()

    adj_list = collections.defaultdict(set)

    #Self matches
    identity_match_count = 0
    for sh, subdf in df.groupby(["simhash", "year_season"]):
        for i in subdf.index:
            for j in subdf.index:
                if i != j:
                    identity_match_count += 1
                    adj_list[i].add(j)

    #Cross matches
    for m1, m2 in simhash.find_all(df.simhash, 2, 1):
        for i in df[df.simhash == m1].index:
            for j in df[(df.simhash == m2)
                        & (df.year_season == df.loc[i].year_season)].index:
                adj_list[i].add(j)
                adj_list[j].add(i)

    rev_map, rep_map = bfs(adj_list)