def testTokenize(self):
    """
    Test tokenize
    """

    self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
    self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
def stream(self, dbfile):
    """
    Connects to SQLite file at dbfile and yields parsed tokens for each row.

    Args:
        dbfile: input SQLite file
    """

    # Connection to database file
    db = sqlite3.connect(dbfile)
    cur = db.cursor()

    cur.execute("SELECT Text FROM sections")

    count = 0
    for section in cur:
        # Tokenize text
        tokens = Tokenizer.tokenize(section[0])

        count += 1
        if count % 1000 == 0:
            print("Streamed %d documents" % (count), end="\r")

        # Skip documents with no tokens parsed
        if tokens:
            yield tokens

    print("Iterated over %d total rows" % (count))

    # Free database resources
    db.close()
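# Usage sketch for the stream() generator above: build a simple token frequency
# count over all sections. This assumes the method belongs to an iterator class,
# named RowIterator here purely for illustration, and that articles.sqlite has
# the sections table referenced above.
from collections import Counter

counts = Counter()
for tokens in RowIterator().stream("articles.sqlite"):
    counts.update(tokens)

print(counts.most_common(10))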
def search(embeddings, cur, query, topn, threshold):
    """
    Executes an embeddings search for the input query. Each returned result is
    resolved to the full section row.

    Args:
        embeddings: embeddings model
        cur: database cursor
        query: query text
        topn: number of documents to return
        threshold: require at least this score to include result

    Returns:
        search results
    """

    if query == "*":
        return []

    # Default threshold if None
    threshold = threshold if threshold is not None else 0.6

    results = []

    # Get list of required and prohibited tokens
    must = [token.strip("+") for token in query.split() if token.startswith("+") and len(token) > 1]
    mnot = [token.strip("-") for token in query.split() if token.startswith("-") and len(token) > 1]

    # Tokenize search query
    query = Tokenizer.tokenize(query)

    # Retrieve topn * 5 to account for duplicate matches
    for uid, score in embeddings.search(query, topn * 5):
        if score >= threshold:
            cur.execute("SELECT Article, Text FROM sections WHERE id = ?", [uid])

            # Get matching row
            sid, text = cur.fetchone()

            # Add result if:
            #  - all required tokens are present or there are no required tokens AND
            #  - all prohibited tokens are absent or there are no prohibited tokens
            if (not must or all(token.lower() in text.lower() for token in must)) and (
                not mnot or all(token.lower() not in text.lower() for token in mnot)
            ):
                # Save result
                results.append((uid, score, sid, text))

    return results
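# Usage sketch for search() above. The index path, database path and query are
# illustrative; this assumes a txtai-style Embeddings index previously saved to
# disk and the same articles.sqlite schema used by the other snippets here.
import sqlite3

from txtai.embeddings import Embeddings

embeddings = Embeddings()
embeddings.load("index")

db = sqlite3.connect("articles.sqlite")
cur = db.cursor()

# "+" marks required tokens, "-" marks prohibited tokens, None uses the default threshold
for uid, score, sid, text in search(embeddings, cur, "+vaccine efficacy -preprint", 10, None):
    print("%.4f" % score, text[:80])

db.close()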
def tokenize(text):
    """
    Tokenizes text into tokens, removes domain specific stop words.

    Args:
        text: input text

    Returns:
        tokens
    """

    # Remove additional stop words to improve highlighting results
    return {token for token in Tokenizer.tokenize(text) if token not in Highlights.STOP_WORDS}
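# Small sketch of how the stop word-filtered token set above could be used:
# rank candidate highlight sentences by token overlap with a query. The Jaccard
# scoring and the assumption that this method lives on the Highlights class are
# illustrative, not taken from the original code.
def overlap(query, sentence):
    q, s = Highlights.tokenize(query), Highlights.tokenize(sentence)
    return len(q & s) / len(q | s) if q | s else 0.0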
def stream(dbfile, maxsize):
    """
    Streams documents from an articles.sqlite file. This method is a generator and
    will yield a row at a time.

    Args:
        dbfile: input SQLite file
        maxsize: maximum number of documents to process
    """

    # Connection to database file
    db = sqlite3.connect(dbfile)
    cur = db.cursor()

    # Select tagged sentences without a NLP label. NLP labels are set for non-informative sentences.
    query = Index.SECTION_QUERY + " AND tags is not null"

    if maxsize > 0:
        query += " AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT %d)" % maxsize

    # Run the query
    cur.execute(query)

    count = 0
    for row in cur:
        # Unpack row
        uid, name, text = row

        if not name or not re.search(Index.SECTION_FILTER, name.lower()):
            # Tokenize text
            tokens = Tokenizer.tokenize(text)

            document = (uid, tokens, None)

            count += 1
            if count % 1000 == 0:
                print("Streamed %d documents" % (count), end="\r")

            # Skip documents with no tokens parsed
            if tokens:
                yield document

    print("Iterated over %d total rows" % (count))

    # Free database resources
    db.close()
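# Sketch of feeding the (uid, tokens, tags) stream above into a txtai-style
# index. The Index.stream reference, model path and file names are assumptions
# for illustration; the real indexing configuration may differ.
from txtai.embeddings import Embeddings

embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})
embeddings.index(Index.stream("articles.sqlite", 0))
embeddings.save("index")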
def search(embeddings, cur, query, topn):
    """
    Executes an embeddings search for the input query. Each returned result is
    resolved to the full section row.

    Args:
        embeddings: embeddings model
        cur: database cursor
        query: query text
        topn: number of documents to return

    Returns:
        search results
    """

    if query == "*":
        return []

    results = []

    # Get list of required tokens
    must = [token.strip("+") for token in query.split() if token.startswith("+")]

    # Tokenize search query
    query = Tokenizer.tokenize(query)

    # Retrieve topn * 5 to account for duplicate matches
    for uid, score in embeddings.search(query, topn * 5):
        if score >= 0.6:
            cur.execute("SELECT Article, Text FROM sections WHERE id = ?", [uid])

            # Get matching row
            sid, text = cur.fetchone()

            # Add result if all required tokens are present or there are no required tokens
            if not must or all(token.lower() in text.lower() for token in must):
                # Save result
                results.append((uid, score, sid, text))

    return results
# Input file is assumed to be the first command line argument
input_file = sys.argv[1]
mode = sys.argv[2]
index_name = 'index'

with open(input_file, 'r') as infile:
    sections = infile.readlines()

# Create an index for the list of sections
doc_dict = {}
index_text = []
for uid, text in enumerate(sections):
    doc_dict[uid] = text.split('\t')
    session_id, raw_text = doc_dict[uid][:2]

    # Only index sections with enough text
    if len(raw_text) > 250:
        raw_text = Tokenizer.tokenize(raw_text)
        index_text.append((uid, raw_text, None))

if mode == 'index':
    print("--indexing-- %d documents" % (len(index_text)))
    embeddings.index(index_text)
    embeddings.save(index_name)
elif mode == 'search':
    print("--searching-- %d documents" % (len(index_text)))
    embeddings.load(index_name)

    for query in ("the milestones for our seed round",
                  "what is possible today",
                  "My philosophy has always been don't solve the human",
                  "story about Larry",
                  "biological memory",
                  "short-term memory",
                  "memory blocks and memory stack",
                  "the company where i programmed robots",
                  "nothing to do with us"):
        # Print uid of each of the top 3 results
        # search result format: (uid, score)
        print(query)
        for uid, _ in embeddings.search(query, 3):
            print(uid)