def main():
    lexicon = Lexicon(config.LEXICON_PATH)
    lexicon.generate_lexicon(list(config.dataset_files(1)))
    lexicon.generate_lexicon(list(config.dataset_files()))
    lexicon_dict = lexicon.get_lexicon_dict()
    print(len(lexicon_dict))
    word_id = lexicon.get_word_id("Dear")
    print(word_id)
    word_exists = lexicon.get_word_id("blablabla")
    print(word_exists)
def main():
    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest='subparser')

    lexicon_argparser = subparser.add_parser("generate_lexicon")
    lexicon_argparser.add_argument(
        '--b_range', type=str,
        help="Start and end of the batch number range to create/update the lexicon from, e.g. 1,3")
    lexicon_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

    forward_index_argparser = subparser.add_parser("generate_forward_index")
    forward_index_argparser.add_argument(
        '--b_range', type=str,
        help="Start and end of the batch number range to create/update the forward index from, e.g. 1,3")
    forward_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

    inverted_index_argparser = subparser.add_parser("generate_inverted_index")
    inverted_index_argparser.add_argument(
        '--b', type=str,
        help="Comma-separated forward index batch names to create the inverted index from.")
    inverted_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

    search_argparser = subparser.add_parser("search")
    search_argparser.add_argument("--q", type=str, help="Search query.")

    args = parser.parse_args()

    if args.subparser == 'generate_lexicon':
        batch_range = list(map(int, args.b_range.split(",")))
        generate_lexicon.main(*batch_range, demo=args.d)
    elif args.subparser == 'generate_forward_index':
        batch_range = list(map(int, args.b_range.split(",")))
        generate_forward_index.main(*batch_range, demo=args.d)
    elif args.subparser == 'generate_inverted_index':
        batches = args.b.split(',')
        generate_inverted_index.main(batches, demo=args.d)
    elif args.subparser == 'search':
        lexicon = Lexicon(config.LEXICON_PATH)
        inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                                       config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                                       len(lexicon),
                                       config.INVERTED_INDEX_BARREL_SIZE)
        search = Search(lexicon, inverted_index)
        print(search.search(args.q))
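# Example invocations, assuming this module is the repository's CLI entry
# point (the file name main.py is an assumption, not taken from the source):
#
#   python main.py generate_lexicon --b_range 1,3 --d 1
#   python main.py generate_forward_index --b_range 1,3
#   python main.py generate_inverted_index --b batch_001,batch_002 --d 1
#   python main.py search --q "hello world"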
# generate_lexicon.main (module name taken from the CLI dispatch above)
def main(batch_start, batch_end, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)
    lexicon.generate_lexicon(config.dataset_files(batch_start, batch_end))
    print(f"Lexicon created with {len(lexicon)} words.")
    print('-' * 32)

    if not demo:
        return

    ### DEMO PRINTING ###
    PRINT_N = 10
    print("### DEMO TEST ###")
    print(f"{PRINT_N} words from the lexicon are: ")
    lexicon_dict = lexicon.get_lexicon_dict()
    for i, word in enumerate(lexicon_dict):
        if i >= PRINT_N:
            break
        print(f"\t{word}: {lexicon_dict[word]}")
    print('-' * 32)
import os
import pickle
import concurrent.futures

# config, Lexicon and ForwardIndex are project-internal and assumed to be
# imported as elsewhere in the repository.


# generate_forward_index.main (module name taken from the CLI dispatch above)
def main(batch_start, batch_end, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)
    forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        threads = []
        if batch_start == batch_end:
            # single batch: index it on one worker thread
            batch_1_thread = executor.submit(
                forward_index.add_to_forward_index,
                config.dataset_files(batch_start, batch_start + 1),
                f"batch_00{batch_start}")
            threads.append(batch_1_thread)
        else:
            # split the batch range in half and index each half concurrently
            mid = int((batch_end + batch_start) / 2)
            batch_1_thread = executor.submit(
                forward_index.add_to_forward_index,
                config.dataset_files(batch_start, mid),
                f"batch_00{batch_start}")
            batch_2_thread = executor.submit(
                forward_index.add_to_forward_index,
                config.dataset_files(mid, batch_end),
                f"batch_00{mid}")
            threads.append(batch_1_thread)
            threads.append(batch_2_thread)
        for f in concurrent.futures.as_completed(threads):
            print(f"{f.result()} forward_index created.")

    if not demo:
        return

    ### DEMO PRINTING ###
    print('-' * 32)
    PRINT_BARREL = 0
    PRINT_N = 2
    print("### DEMO TEST ###")
    print(f"{PRINT_N} entry(s) from barrel {PRINT_BARREL}:")
    with open(os.path.join(config.FORWARD_INDEX_BARRELS_PATH,
                           f"batch_00{PRINT_BARREL}"), 'rb') as forward_index_file:
        forward_index = pickle.load(forward_index_file)
        for i, doc_id in enumerate(forward_index):
            if i >= PRINT_N:
                break
            print(f"\t{doc_id}:")
            for word_id in forward_index[doc_id]:
                print(f"\t\t{word_id}: {forward_index[doc_id][word_id]}")
    print('-' * 32)
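# Worked example: main(1, 3) splits the range at mid = 2 and indexes
# dataset_files(1, 2) and dataset_files(2, 3) on separate worker threads,
# writing the barrels "batch_001" and "batch_002" respectively.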
# generate_inverted_index.main (module name taken from the CLI dispatch above);
# uses the same os, pickle and concurrent.futures imports as the step above
def main(forward_index_batches, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)
    inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                                   config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                                   len(lexicon),
                                   config.INVERTED_INDEX_BARREL_SIZE)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        threads = []
        for fib in forward_index_batches:
            thread = executor.submit(
                inverted_index.invert_forward_index,
                os.path.join(config.FORWARD_INDEX_BARRELS_PATH, fib))
            # without this append, as_completed() below iterates over an
            # empty list and no results are ever reported
            threads.append(thread)
        for f in concurrent.futures.as_completed(threads):
            print(f"{f.result()} created.")

    inverted_index.merge_buckets()

    if not demo:
        return

    ### DEMO PRINTING ###
    print('-' * 32)
    PRINT_BARREL = 3
    PRINT_N = 30
    print("### DEMO TEST ###")
    print(f"{PRINT_N} entries from barrel {PRINT_BARREL}:")
    with open(os.path.join(config.INVERTED_INDEX_BARRELS_PATH,
                           f"{PRINT_BARREL:03}_inverted"), 'rb') as inverted_index_file:
        inverted_index = pickle.load(inverted_index_file)
        for i, word_id in enumerate(inverted_index):
            if i >= PRINT_N:
                break
            print(f"\t{word_id}:")
            for doc in inverted_index[word_id]:
                print(f"\t\t{doc}: {inverted_index[word_id][doc]}")
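# Example: invert two forward index barrels produced by the step above and
# print a sample of the merged result:
#
#   main(["batch_001", "batch_002"], demo=True)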
def main():
    lexicon = Lexicon(config.LEXICON_PATH)
    forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)
    forward_index.add_to_forward_index(list(config.dataset_files(0, 1)),
                                       'aftab_test_forward_1')
import os
import json

from flask import Flask, render_template
from flask_cors import CORS, cross_origin
from flask_restful import Api, Resource

import config
from indexing.lexicon import Lexicon
from indexing.inverted_index import InvertedIndex
from search.search import Search

# flask app & Api
app = Flask(__name__)
api = Api(app)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Indexes
lexicon = Lexicon(config.LEXICON_PATH)
inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                               config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                               len(lexicon),
                               config.INVERTED_INDEX_BARREL_SIZE)
search = Search(lexicon, inverted_index)


# for handling searches
class Setup(Resource):
    @cross_origin()
    def get(self):
        return render_template('index.html')


class Document(Resource):
    def get(self, doc_id):
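# The snippet above ends mid-definition and the resource registration is not
# shown; a minimal sketch of how these flask_restful resources might be wired
# up (the URL paths below are assumptions, not taken from the repository):
#
#   api.add_resource(Setup, '/')
#   api.add_resource(Document, '/document/<int:doc_id>')
#
#   if __name__ == '__main__':
#       app.run()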
import os
import json
from datetime import datetime

from tqdm import tqdm

# DICT_PATH, DATA_PATH, readFile, clean, getBarrel, Lexicon, ForwardIndexer
# and InvertedIndexer are project-internal imports/constants assumed to be
# in scope as elsewhere in the repository.


class Indexer:
    """Builds and maintains the lexicon, forward index and inverted index
    over the document dataset."""

    def __init__(self):
        self.lexicon = Lexicon(DICT_PATH)
        self.indexedDocs = self.loadIndexedDocs()
        self.metadata = self.loadMetadata()
        self.forwardIndexer = ForwardIndexer(self.indexedDocs)
        self.invertedIndexer = InvertedIndexer()

    def addFile(self, dictDir, file):
        """
        arguments:
        - dictDir: the path of the directory containing the dictionaries
          for the forward and the inverted index
        - file: the path to the file that is to be added

        This function updates the lexicon to accommodate the new file and
        adds the file to the forward and inverted indexes.

        return: None
        """
        # if document is already indexed, return
        if self.indexedDocs.get(file[-21:]) is not None:
            print(datetime.now(), "Document already present in index.")
            return
        print(file)
        print(datetime.now(), "Adding document to index.")
        # get author name, title, all texts, url, weightedShares and file path of given file
        author, title, tokens, url, shares, filepath = readFile(file)
        # clean the texts for short and long barreling
        shortTokens = clean(author + " " + title)
        tokens = clean(tokens)
        # add tokens to lexicon
        self.lexicon.processFile(tokens)
        self.lexicon.processFile(shortTokens)
        # get unique, sorted wordIDs present in the file
        wordIDs = sorted(set([self.lexicon.lexDict[token] for token in tokens]))
        shortWordIDs = sorted(
            set([self.lexicon.lexDict[token] for token in shortTokens]))
        # get all barrels that are to be updated
        barrels = sorted(set([getBarrel(wordID) for wordID in wordIDs]))
        shortBarrels = sorted(
            set([getBarrel(wordID) for wordID in shortWordIDs]))
        # add data to long and short forward barrels
        shortForwardBarrels, _ = self.forwardIndexer.addFile(
            dictDir, self.lexicon, shortTokens, shortBarrels, short=True)
        forwardBarrels, docID = self.forwardIndexer.addFile(
            dictDir, self.lexicon, tokens, barrels)
        # add data to long and short inverted barrels
        self.invertedIndexer.addFile(dictDir, shortWordIDs, docID,
                                     shortBarrels, shortForwardBarrels,
                                     short=True)
        self.invertedIndexer.addFile(dictDir, wordIDs, docID, barrels,
                                     forwardBarrels)
        print(datetime.now(), "Document added to index.")
        # add documentID into indexedDocs so it is not indexed again
        self.indexedDocs[file[-21:]] = docID
        # store document's metadata
        self.addMetadata(docID, author, title, url, shares, filepath)
        print(docID)
        forwardBarrels.clear()

    def indexDataset(self):
        """
        This function will iterate over the dataset provided in DATA_PATH
        and will index it. The indexes and lexicon will be written to the
        DICT_PATH directory.

        return: None
        """
        shortForwardBarrels = dict()
        forwardBarrels = dict()
        print(datetime.now(), "Generating lexicon and forward index.")
        for folder in os.listdir(DATA_PATH):
            FILE_PATH = DATA_PATH + '/' + folder
            for file in tqdm(os.listdir(FILE_PATH)):
                path = FILE_PATH + '/' + file
                # make sure document is not already indexed
                if self.indexedDocs.get(path[-21:]) is not None:
                    continue
                # get author name, title, all texts, url, weightedShares and file path of given file
                author, title, tokens, url, shares, filepath = readFile(path)
                # make tokens for long and short barreling
                shortTokens = clean(author + " " + title)
                tokens = clean(tokens)
                # add tokens to lexicon
                self.lexicon.processFile(shortTokens)
                self.lexicon.processFile(tokens)
                # index short barrels
                self.forwardIndexer.processFile(self.lexicon,
                                                shortForwardBarrels,
                                                shortTokens, short=True)
                # index long barrels
                self.forwardIndexer.processFile(self.lexicon, forwardBarrels,
                                                tokens)
                # record that document has been indexed
                self.indexedDocs[path[-21:]] = self.forwardIndexer.docID - 1
                # store document's metadata
                self.addMetadata(self.forwardIndexer.docID - 1, author, title,
                                 url, shares, filepath)
        # dump short barrels
        print(datetime.now(), "Writing short forward index to file.")
        self.forwardIndexer.dump(DICT_PATH, shortForwardBarrels,
                                 overwrite=False, short=True)
        # dump long barrels
        print(datetime.now(), "Writing long forward index to file.")
        self.forwardIndexer.dump(DICT_PATH, forwardBarrels, overwrite=False)
        forwardBarrels.clear()
        # invert short barrels
        print(datetime.now(), "Generating short inverted index.")
        for file in os.listdir(os.path.join(DICT_PATH, 'short_forward_barrels')):
            self.invertedIndexer.processFile(DICT_PATH, file,
                                             int(file[8:-5]), short=True)
        # invert long barrels
        print(datetime.now(), "Generating long inverted index.")
        for file in os.listdir(os.path.join(DICT_PATH, 'forward_barrels')):
            self.invertedIndexer.processFile(DICT_PATH, file, int(file[8:-5]))
        print(datetime.now(), "Indexing complete.")

    def addMetadata(self, docID, author, title, url, shares, filepath):
        # store arguments in metadata dictionary
        self.metadata[str(docID)] = [title, author, url, shares, filepath]

    def loadIndexedDocs(self):
        # load and return indexedDocs
        try:
            with open(os.path.join(DICT_PATH, 'indexed_docs.json'), 'r',
                      encoding="utf8") as f:
                indexedDocs = json.load(f)
        except FileNotFoundError:
            indexedDocs = dict()
        return indexedDocs

    def loadMetadata(self):
        # load and return metadata
        try:
            with open(os.path.join(DICT_PATH, 'metadata.json'), 'r',
                      encoding="utf8") as f:
                metadata = json.load(f)
        except FileNotFoundError:
            metadata = dict()
        return metadata
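# A minimal usage sketch, assuming DICT_PATH and DATA_PATH point at the
# repository's dictionary and dataset directories; path_to_new_document is
# a placeholder, not a name from the source:
#
#   indexer = Indexer()
#   indexer.indexDataset()                    # bulk-index everything under DATA_PATH
#   indexer.addFile(DICT_PATH, path_to_new_document)  # add one document incrementally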