Exemplo n.º 1
0
def main():
    """Run a two-stage search: boolean tf-idf retrieval, then embedding re-rank.

    Reads CLI args, builds/loads the index over all files under ``args.root``,
    retrieves a candidate set with a boolean query (stage L0), re-ranks the
    top candidates by cosine similarity of their embeddings (stage L1), and
    prints the best ``args.l1_size`` results.
    """
    args = arg_parse()
    # Collect relative paths "<subdir>/<file>" for every file under root.
    # Renamed loop variables: the originals shadowed the builtins `dir`/`id`.
    docs = [
        os.path.join(subdir, fname) for subdir in os.listdir(args.root)
        for fname in os.listdir(os.path.join(args.root, subdir))
    ]
    docs = sorted(docs)

    index = Indexer(docs, args.index, args.root)
    embedder = Embedder()

    # L0: coarse boolean retrieval with tf-idf scores.
    q_pos, q_neg = query_expand(args.query)
    # Require ALL positive tokens: join them with AND.  str.join is robust
    # to repeated whitespace, unlike the previous re.sub(" ", " AND ", ...).
    q_pos_expand = " AND ".join(q_pos.split())
    hits = index.query_boolean(q_pos_expand.split())
    # Drop documents that contain any NOT-ed token.
    if q_neg:
        for token in q_neg.split():
            term = index.stemmer.stem(token)
            try:
                not_posting = index.tfidf(index.index[term])
            except KeyError:
                # Term absent from the index: nothing to subtract.
                not_posting = []
            hits = not_and_postings(not_posting, hits)

    if not hits:
        print("nothing found")
        return

    # Keep only the l0_size best tf-idf hits for the expensive embedding stage.
    hits = sorted(hits, key=lambda item: item[1], reverse=True)
    hits = hits[:args.l0_size]

    # L1: re-rank survivors by cosine similarity of their text embeddings.
    doc_ids = [x[0] for x in hits]
    filenames = [os.path.join(args.root, docs[i]) for i in doc_ids]
    texts = [get_text_reduced(x, maxlen=512) for x in filenames]

    if args.batch_size >= args.l0_size:
        embeddings = embedder.embed(texts)
    else:
        # Batch the embedding calls to fit in memory.
        embeddings = batch_embed(embedder, texts, args.batch_size)
    query_emb = embedder.embed([q_pos])[0]
    dist_cos = [cosine(query_emb, e) for e in embeddings]
    idx_cos = np.argsort(dist_cos)

    # Render the top l1_size results together with both ranking scores.
    q_red = query_reduce(args.query)
    resorted = [doc_ids[i] for i in idx_cos]
    for i, doc_id in enumerate(resorted[:args.l1_size]):
        print("\n{}:".format(i))
        index.render_file(q_red.split(), docs[doc_id])
        orig_pos = idx_cos[i]
        print("\tL0 rank = {}; tf-idf = {:.3f}; cos-sim = {:.3f}".format(
            orig_pos, hits[orig_pos][1], 1 - dist_cos[orig_pos]))
Exemplo n.º 2
0
def calc_embeddings(docs: List[str], batch_size: int, root: str) -> np.ndarray:
    """Compute embeddings for all documents, one batch at a time.

    Args:
        docs: Document filenames relative to ``root``.
        batch_size: Number of documents embedded per model call.
        root: Root directory containing the documents.

    Returns:
        Numpy array of (N, 768) of texts embeddings.

    """
    embedder = Embedder()
    out = np.zeros((len(docs), 768), dtype=np.float32)

    # Ceiling division: a trailing partial batch needs one extra pass.
    n_batches = (len(docs) + batch_size - 1) // batch_size

    for b in trange(n_batches):
        lo, hi = b * batch_size, (b + 1) * batch_size
        paths = [os.path.join(root, name) for name in docs[lo:hi]]
        chunk = [get_text_reduced(p, maxlen=512) for p in paths]
        out[lo:hi] = embedder.embed(chunk)
    return out
Exemplo n.º 3
0
def embed(stego, secret, chromosome):
    """Hide *secret* inside the *stego* host image as directed by *chromosome*."""
    # Unpacked chromosomes carry more than 7 entries; pack before use.
    if len(chromosome) > 7:
        chromosome = helper_individual.packchromosome(chromosome)

    # Flatten the host into a chromosome-ordered pixel sequence, write the
    # flattened secret into it, then restore the original image shape.
    pixels = MatScanner.scan_genetic(stego, chromosome)
    pixels = Embedder.embed(pixels, secret.flatten(), chromosome)
    return MatScanner.reshape_genetic(pixels, stego.shape, chromosome)
def main():
    """Parse every grobid XML file and pickle each document with its embedding."""
    embedder = Embedder()
    parser = DocParser()
    # Stream parsed documents into a single pickle file, one dump per doc.
    with open('grobid_data.pkl', 'wb') as output:
        for subdir, dirs, files in os.walk(grobid_path):
            print(len(files))
            for count, file in enumerate(files):
                print(count)

                doc = parser.parseXML(os.path.join(subdir, file))
                # Document id = filename up to the first dot.
                doc.id = str(file).split('.')[0]
                # Nothing to embed without an abstract; skip such docs.
                if len(doc.abstract) == 0:
                    continue
                doc.embedding = embedder.embed(doc.abstract)
                pickle.dump(doc, output, pickle.HIGHEST_PROTOCOL)
Exemplo n.º 5
0
def batch_embed(embedder: Embedder, texts: List[str],
                batch_size: int) -> np.ndarray:
    """Embed texts in chunks so the full set never hits GPU memory at once.

    Args:
        embedder: Embedder with DistilBERT model.
        texts: List of songs' texts.
        batch_size: Batch size.

    Returns:
        Numpy array of (N, 768) of texts embeddings.

    """
    result = np.zeros((len(texts), 768))

    # Ceiling division: one extra iteration covers a trailing partial batch.
    n_batches = (len(texts) + batch_size - 1) // batch_size

    for b in range(n_batches):
        lo, hi = b * batch_size, (b + 1) * batch_size
        result[lo:hi] = embedder.embed(texts[lo:hi])
    return result
Exemplo n.º 6
0
# Iterated-embedding experiment: repeatedly embed a biometric image into the
# same carrier, tracking how carrier-vs-stego PSNR (em_psnr) and
# expected-vs-extracted PSNR (ex_psnr) evolve per iteration.
biometric = cv2.imread(BIOMETRIC)
carrier = cv2.imread(CARRIER)
# Ground-truth extraction target; compared in grayscale.
expected = cv2.cvtColor(cv2.imread(EXPECTED), cv2.COLOR_BGR2GRAY)

# NOTE(review): uses only shape[0] — presumably the biometric is square; confirm.
BIOMETRIC_SIZE = biometric.shape[0]
actualSize = carrier.shape
# All-zero corners: no geometric correction applied in this experiment —
# TODO confirm against Extractor.extract's corner semantics.
corners = [[0, 0], [0, 0], [0, 0], [0, 0]]

embedded = carrier.copy()

em = Embedder(PASSES, KEY, CHANNELS, DEPTH, MASK)
ex = Extractor(PASSES, KEY, ~MASK, SHIFT)

em_psnr = []
ex_psnr = []
for iteration in range(ITERATIONS):
    # Embed into the previous iteration's output, so effects accumulate.
    embedded = em.embed(biometric, embedded)
    filename = './CarrierVsRecoved/Iteration_' + str(iteration) + '.jpg'
    # Round-trip through JPEG so lossy-compression artifacts are included
    # in the measurement before extracting.
    cv2.imwrite(filename, embedded)
    embedded = cv2.imread(filename)
    em_psnr.append(cv2.PSNR(carrier, embedded))

    extracted = ex.extract(embedded, corners, actualSize, BIOMETRIC_SIZE)
    ex_psnr.append(cv2.PSNR(expected, extracted))
    print(iteration, em_psnr[-1], ex_psnr[-1])

cv2.imwrite('./CarrierVsRecoved/final.png', extracted)
# Blue: carrier-vs-stego PSNR; red: expected-vs-extracted PSNR.
plt.plot(em_psnr, 'b')
plt.plot(ex_psnr, 'r')
plt.show()
Exemplo n.º 7
0
# Pass-count sweep: run embed/extract with 0..MAX_PASSES-1 passes and record
# PSNR for both the stego image quality and the recovered biometric.
# Central crop spanning the middle half of the carrier in each dimension.
x1 = int(CARRIER_SIZE[0] / 2) - int(CARRIER_SIZE[0] / 4)
x2 = int(CARRIER_SIZE[0] / 2) + int(CARRIER_SIZE[0] / 4)

y1 = int(CARRIER_SIZE[1] / 2) - int(CARRIER_SIZE[1] / 4)
y2 = int(CARRIER_SIZE[1] / 2) + int(CARRIER_SIZE[1] / 4)
crop = (x1, y1, x2, y2)

# Reference region from the unmodified carrier, for the stego-quality PSNR.
# NOTE(review): slices rows with the x range and columns with the y range —
# verify this matches CARRIER_SIZE's (rows, cols) vs (width, height) convention.
untouched = carrier[crop[0]:crop[2], crop[1]:crop[3]]

em_psnr = []
ex_psnr = []
for passes in range(MAX_PASSES):
    # Fresh embedder/extractor per pass count under test.
    em = Embedder(passes, KEY, CHANNELS, DEPTH, MASK)
    ex = Extractor(passes, KEY, ~MASK, SHIFT)

    embedded = em.embed(biometric, carrier)

    # Locate the payload inside the cropped stego region before extracting.
    cropped = embedded[crop[0]:crop[2], crop[1]:crop[3]]
    margins = gl.getMargins(cropped, GUIDE_MASK)
    corners = gl.marginsToCorners(margins)
    actualSize = gl.getActualSize(cropped, margins)

    extracted = ex.extract(cropped, corners, actualSize, BIOMETRIC_SIZE)
    ex_psnr.append(cv2.PSNR(expected, extracted))
    em_psnr.append(cv2.PSNR(np.uint8(cropped), np.uint8(untouched)))
    print(passes, em_psnr[-1], ex_psnr[-1])

cv2.imwrite('./final.png', extracted)
# Blue: stego-quality PSNR; red: recovered-biometric PSNR, vs pass count.
plt.plot(em_psnr, 'b')
plt.plot(ex_psnr, 'r')
plt.show()