import os
import re

import numpy as np

# Project-local helpers (arg_parse, Indexer, Embedder, query_expand,
# query_reduce, get_text_reduced, not_and_postings, batch_embed, cosine)
# are assumed to be in scope.


def main():
    args = arg_parse()
    docs = sorted(
        os.path.join(d, f)
        for d in os.listdir(args.root)
        for f in os.listdir(os.path.join(args.root, d))
    )
    index = Indexer(docs, args.index, args.root)
    embedder = Embedder()

    # L0: boolean retrieval ranked by tf-idf
    q_pos, q_neg = query_expand(args.query)
    # Get all OR-ed tokens
    q_pos_expand = re.sub(r" ", " AND ", q_pos)
    hits = index.query_boolean(q_pos_expand.split())
    # Remove all NOT-ed tokens
    if q_neg:
        for token in q_neg.split():
            term = index.stemmer.stem(token)
            try:
                not_posting = index.tfidf(index.index[term])
            except KeyError:
                not_posting = []
            hits = not_and_postings(not_posting, hits)
    if not hits:
        print("nothing found")
        return
    hits = sorted(hits, key=lambda item: item[1], reverse=True)
    hits = hits[:args.l0_size]

    # L1: re-rank the L0 hits by embedding similarity to the query
    doc_ids = [x[0] for x in hits]
    filenames = [os.path.join(args.root, docs[i]) for i in doc_ids]
    texts = [get_text_reduced(x, maxlen=512) for x in filenames]
    if args.batch_size >= args.l0_size:
        embeddings = embedder.embed(texts)
    else:
        embeddings = batch_embed(embedder, texts, args.batch_size)
    query_emb = embedder.embed([q_pos])[0]
    dist_cos = [cosine(query_emb, e) for e in embeddings]
    idx_cos = np.argsort(dist_cos)

    # Render the top L1 results with both ranking scores
    q_red = query_reduce(args.query)
    resorted = [doc_ids[i] for i in idx_cos]
    for i, doc_id in enumerate(resorted[:args.l1_size]):
        print("\n{}:".format(i))
        index.render_file(q_red.split(), docs[doc_id])
        orig_pos = idx_cos[i]
        print("\tL0 rank = {}; tf-idf = {:.3f}; cos-sim = {:.3f}".format(
            orig_pos, hits[orig_pos][1], 1 - dist_cos[orig_pos]))
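# Hypothetical sketch of the not_and_postings helper used in main() above;
# the real implementation is not shown in this snippet. Assuming postings
# are (doc_id, score) pairs, as the sorting and rendering code implies, it
# drops every hit whose document appears in the NOT-ed posting list.
def not_and_postings(not_posting, hits):
    excluded = {doc_id for doc_id, _ in not_posting}
    return [hit for hit in hits if hit[0] not in excluded]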
import os
from typing import List

import numpy as np
from tqdm import trange

# Embedder and get_text_reduced are project-local helpers assumed in scope.


def calc_embeddings(docs: List[str], batch_size: int, root: str) -> np.ndarray:
    """Calculate embeddings (in batches).

    Args:
        docs: List of document filenames.
        batch_size: Batch size.
        root: Root directory.

    Returns:
        NumPy array of shape (N, 768) with text embeddings.
    """
    embedder = Embedder()
    all_embeddings = np.zeros((len(docs), 768), dtype=np.float32)
    # Number of batches, rounded up to cover a partial final batch
    iters = len(docs) // batch_size
    if len(docs) % batch_size > 0:
        iters += 1
    for i in trange(iters):
        batch = docs[i * batch_size:(i + 1) * batch_size]
        filenames = [os.path.join(root, doc) for doc in batch]
        texts = [get_text_reduced(x, maxlen=512) for x in filenames]
        embeddings = embedder.embed(texts)
        all_embeddings[i * batch_size:(i + 1) * batch_size] = embeddings
    return all_embeddings
def embed(stego, secret, chromosome):
    """Embed a secret message into the host image using the chromosome."""
    if len(chromosome) > 7:
        chromosome = helper_individual.packchromosome(chromosome)
    # Convert the host image to a flattened pixel sequence
    stego_sequence = MatScanner.scan_genetic(stego, chromosome)
    secret = secret.flatten()
    stego_sequence = Embedder.embed(stego_sequence, secret, chromosome)
    # Reshape the sequence back into the stego image
    return MatScanner.reshape_genetic(stego_sequence, stego.shape, chromosome)
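# Illustrative only: a plain LSB embedder over a flattened pixel sequence,
# to show the kind of operation Embedder.embed performs on stego_sequence
# above. The real, chromosome-driven Embedder is not shown in this snippet.
import numpy as np

def lsb_embed(pixels: np.ndarray, bits: np.ndarray) -> np.ndarray:
    out = pixels.copy()
    n = bits.size  # assumes n <= pixels.size
    # Clear each target pixel's least significant bit, then write a secret bit
    out[:n] = (out[:n] >> 1 << 1) | (bits & 1)
    return out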
import os
import pickle

# Embedder, DocParser, and grobid_path are project-local and assumed in scope.


def main():
    embedder = Embedder()
    parser = DocParser()
    # Iterate through the GROBID output, pickling one parsed doc at a time
    with open('grobid_data.pkl', 'wb') as output:
        for subdir, dirs, files in os.walk(grobid_path):
            print(len(files))
            count = 0
            for file in files:
                print(count)
                count += 1
                # print(os.path.join(subdir, file))
                doc = parser.parseXML(os.path.join(subdir, file))
                doc.id = str(file).split('.')[0]
                if len(doc.abstract) == 0:
                    continue
                doc.embedding = embedder.embed(doc.abstract)
                # pair = variablesFromPair((doc.abstract, doc.title), word_index, embedding_map)
                # if (len(pair[0]) == 0 or len(pair[1]) == 0):
                #     continue
                # doc.embedding = encode(encoder, pair[0])
                pickle.dump(doc, output, pickle.HIGHEST_PROTOCOL)
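# Reading the stream back: pickle.dump was called once per document above,
# so the standard way to consume the file is to call pickle.load repeatedly
# until EOFError. A sketch; load_docs is a hypothetical name, the filename
# matches the writer above.
import pickle

def load_docs(path='grobid_data.pkl'):
    docs = []
    with open(path, 'rb') as f:
        while True:
            try:
                docs.append(pickle.load(f))
            except EOFError:
                break
    return docs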
from typing import List

import numpy as np


def batch_embed(embedder: Embedder, texts: List[str], batch_size: int) -> np.ndarray:
    """Get embeddings in batches if GPU memory is not enough.

    Args:
        embedder: Embedder with a DistilBERT model.
        texts: List of song texts.
        batch_size: Batch size.

    Returns:
        NumPy array of shape (N, 768) with text embeddings.
    """
    embeddings = np.zeros((len(texts), 768))
    # Number of batches, rounded up to cover a partial final batch
    iters = len(texts) // batch_size
    if len(texts) % batch_size > 0:
        iters += 1
    for i in range(iters):
        batch = texts[i * batch_size:(i + 1) * batch_size]
        emb_batch = embedder.embed(batch)
        embeddings[i * batch_size:(i + 1) * batch_size] = emb_batch
    return embeddings
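# A minimal sketch of what the Embedder used above might look like
# (hypothetical: assumes Hugging Face transformers and mean-pooled
# DistilBERT hidden states, matching the (N, 768) contract of batch_embed).
import numpy as np
import torch
from transformers import DistilBertModel, DistilBertTokenizerFast


class Embedder:
    def __init__(self, model_name: str = 'distilbert-base-uncased'):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertModel.from_pretrained(model_name)
        self.model.eval()

    def embed(self, texts):
        # Tokenize, run DistilBERT, mean-pool token states into one (768,) vector per text
        enc = self.tokenizer(texts, padding=True, truncation=True,
                             max_length=512, return_tensors='pt')
        with torch.no_grad():
            hidden = self.model(**enc).last_hidden_state
        mask = enc['attention_mask'].unsqueeze(-1).float()
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
        return pooled.numpy()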
import cv2
import matplotlib.pyplot as plt

# Embedder, Extractor, and the upper-case constants (BIOMETRIC, CARRIER,
# EXPECTED, PASSES, KEY, CHANNELS, DEPTH, MASK, SHIFT, ITERATIONS) are
# project-local and assumed in scope.

biometric = cv2.imread(BIOMETRIC)
carrier = cv2.imread(CARRIER)
expected = cv2.cvtColor(cv2.imread(EXPECTED), cv2.COLOR_BGR2GRAY)
BIOMETRIC_SIZE = biometric.shape[0]
actualSize = carrier.shape
corners = [[0, 0], [0, 0], [0, 0], [0, 0]]
embedded = carrier.copy()
em = Embedder(PASSES, KEY, CHANNELS, DEPTH, MASK)
ex = Extractor(PASSES, KEY, ~MASK, SHIFT)
em_psnr = []
ex_psnr = []
for iteration in range(ITERATIONS):
    # Re-embed into the previous iteration's output, round-tripping it
    # through lossy JPEG compression each time
    embedded = em.embed(biometric, embedded)
    filename = './CarrierVsRecoved/Iteration_' + str(iteration) + '.jpg'
    cv2.imwrite(filename, embedded)
    embedded = cv2.imread(filename)
    em_psnr.append(cv2.PSNR(carrier, embedded))
    extracted = ex.extract(embedded, corners, actualSize, BIOMETRIC_SIZE)
    ex_psnr.append(cv2.PSNR(expected, extracted))
    print(iteration, em_psnr[-1], ex_psnr[-1])
cv2.imwrite('./CarrierVsRecoved/final.png', extracted)
plt.plot(em_psnr, 'b')
plt.plot(ex_psnr, 'r')
plt.show()
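# For reference, the standard PSNR formula that cv2.PSNR computes for 8-bit
# images (a sketch with a hypothetical name; the scripts above call cv2.PSNR
# directly):
import numpy as np

def psnr(a: np.ndarray, b: np.ndarray, max_val: float = 255.0) -> float:
    mse = np.mean((a.astype(np.float64) - b.astype(np.float64)) ** 2)
    if mse == 0:
        return float('inf')  # identical images
    return 10.0 * np.log10(max_val ** 2 / mse)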
import cv2
import matplotlib.pyplot as plt
import numpy as np

# Embedder, Extractor, the gl (guide/locator) helper module, and the
# upper-case constants are project-local and assumed in scope.

# Central crop: the middle half of the carrier in each dimension
x1 = int(CARRIER_SIZE[0] / 2) - int(CARRIER_SIZE[0] / 4)
x2 = int(CARRIER_SIZE[0] / 2) + int(CARRIER_SIZE[0] / 4)
y1 = int(CARRIER_SIZE[1] / 2) - int(CARRIER_SIZE[1] / 4)
y2 = int(CARRIER_SIZE[1] / 2) + int(CARRIER_SIZE[1] / 4)
crop = (x1, y1, x2, y2)
untouched = carrier[crop[0]:crop[2], crop[1]:crop[3]]
em_psnr = []
ex_psnr = []
for passes in range(MAX_PASSES):
    em = Embedder(passes, KEY, CHANNELS, DEPTH, MASK)
    ex = Extractor(passes, KEY, ~MASK, SHIFT)
    embedded = em.embed(biometric, carrier)
    # Extract from the cropped region only, locating it via the guide mask
    cropped = embedded[crop[0]:crop[2], crop[1]:crop[3]]
    margins = gl.getMargins(cropped, GUIDE_MASK)
    corners = gl.marginsToCorners(margins)
    actualSize = gl.getActualSize(cropped, margins)
    extracted = ex.extract(cropped, corners, actualSize, BIOMETRIC_SIZE)
    ex_psnr.append(cv2.PSNR(expected, extracted))
    em_psnr.append(cv2.PSNR(np.uint8(cropped), np.uint8(untouched)))
    print(passes, em_psnr[-1], ex_psnr[-1])
cv2.imwrite('./final.png', extracted)
plt.plot(em_psnr, 'b')
plt.plot(ex_psnr, 'r')
plt.show()