def data_generator(index_dir, window_size, include_stop_words=False):
    """Given an index directory and a window size, outputs a list of
    (sentence, number of men on screen, number of women on screen,
     mean number of men on screen, mean number of women on screen, channel)
    tuples. The sentence can be returned with or without stop words.
    """
    # Open the transcript files
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    channel = 'MSNBC'
    # Document id range for each channel
    var = {'CNN': (1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)}
    SIZE = 20000

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    # Get the token id -> word mapping
    words = get_lexicon()
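# Illustrative sketch (an assumption, not part of the original function): one way
# the channel -> document-id table above could be used to restrict iteration to a
# single channel. Treating the upper bound as inclusive is also an assumption.
def channel_doc_range(channel='MSNBC'):
    channel_doc_ids = {'CNN': (1, 82529), 'FOX': (82530, 162639),
                       'MSNBC': (162640, 246922)}
    start_id, end_id = channel_doc_ids[channel]
    return range(start_id, end_id + 1)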
def main(index_dir, query, silent, context_size):
    idx_path = os.path.join(index_dir, 'index.bin')
    doc_path = os.path.join(index_dir, 'documents.txt')
    data_path = os.path.join(index_dir, 'data')
    lex_path = os.path.join(index_dir, 'lexicon.txt')

    # Load the document list, its token data, and the lexicon
    documents = Documents.load(doc_path)
    documents.configure(data_path)
    lexicon = Lexicon.load(lex_path)

    with CaptionIndex(idx_path, lexicon, documents) as index:
        if len(query) > 0:
            print('Query: ', query)
            run_search(' '.join(query), documents, lexicon, index,
                       context_size, silent)
        else:
            # No query on the command line: prompt interactively
            print('Enter a query:')
            while True:
                try:
                    query = input('> ')
                except (EOFError, KeyboardInterrupt):
                    print()
                    break
                query = query.strip()
                if len(query) > 0:
                    try:
                        run_search(query, documents, lexicon, index,
                                   context_size, silent)
                    except:
                        traceback.print_exc()
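# A minimal command-line wrapper for the interactive search entry point above.
# This is a sketch only: the original script's argument parsing is not shown, so
# the flag names and defaults here are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Search a caption index')
    parser.add_argument('index_dir', help='directory containing the index files')
    parser.add_argument('query', nargs='*',
                        help='optional query; if omitted, prompt interactively')
    parser.add_argument('-s', '--silent', action='store_true')
    parser.add_argument('-c', '--context-size', type=int, default=3)
    args = parser.parse_args()
    main(args.index_dir, args.query, args.silent, args.context_size)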
def main(index_dir, workers, limit):
    doc_path = os.path.join(index_dir, 'documents.txt')
    documents = Documents.load(doc_path)
    if limit is None:
        limit = len(documents)

    start_time = time.time()
    with Pool(processes=workers, initializer=init_worker,
              initargs=(count_tokens, index_dir)) as pool:
        count = 0
        for n in tqdm(pool.imap_unordered(count_tokens, range(limit)),
                      desc='Counting tokens', total=limit):
            count += n
    print('Scanned {} documents for {} tokens in {:d}ms'.format(
        limit, count, int(1000 * (time.time() - start_time))))
def init_worker(function, index_dir):
    # Runs once in each worker process: load the document list and attach it to
    # the worker function so it does not have to be reloaded for every task.
    doc_path = os.path.join(index_dir, 'documents.txt')
    data_dir = os.path.join(index_dir, 'data')
    function.documents = Documents.load(doc_path)
    function.documents.configure(data_dir)
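# Hypothetical worker compatible with init_worker above (the real count_tokens is
# defined elsewhere and not shown). This stub only illustrates the pattern: the
# per-process state attached by init_worker is read off the function object.
def count_tokens_stub(doc_id):
    documents = count_tokens_stub.documents   # set by init_worker in each worker
    # ... open document `doc_id` via `documents` and count its tokens here ...
    return 0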
from captions import Documents, Lexicon, CaptionIndex, MetadataIndex

INDEX_DIR = '/app/data/index'
DOCUMENTS_PATH = os.path.join(INDEX_DIR, 'docs.list')
LEXICON_PATH = os.path.join(INDEX_DIR, 'words.lex')
INDEX_PATH = os.path.join(INDEX_DIR, 'index.bin')
# METADATA_PATH = os.path.join(INDEX_DIR, 'meta.bin')

print('Loading the document list and lexicon', file=sys.stderr)

# Load these structures only once: if the globals already exist (e.g. the module
# was re-imported or reloaded), skip the expensive load.
try:
    DOCUMENTS
    LEXICON
    INDEX
except NameError:
    DOCUMENTS = Documents.load(DOCUMENTS_PATH)
    LEXICON = Lexicon.load(LEXICON_PATH)
    INDEX = CaptionIndex(INDEX_PATH, LEXICON, DOCUMENTS)


def is_word_in_lexicon(word):
    return word in LEXICON


def _get_video_name(p):
    """Only the filename, without extensions"""
    return Path(p).name.split('.')[0]


def _init_doc_id_to_vid_id():
    video_ids = [v.id for v in Video.objects.all()]
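# Example usage of the helpers above. The video path is hypothetical, and the
# lexicon lookup result depends on the data that was loaded.
example_path = '/app/data/videos/CNNW_20160101_120000.mp4'
assert _get_video_name(example_path) == 'CNNW_20160101_120000'
print(is_word_in_lexicon('senate'))   # True only if 'senate' appears in words.lex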
def main(
    index_dir: str,
    new_doc_dir: Optional[str],
    chunk_size: Optional[int] = None,
    skip_existing_names: bool = False
):
    assert chunk_size is None or chunk_size > 0

    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)
    documents = Documents.load(doc_path)

    if new_doc_dir:
        new_docs_to_index = list_docs(new_doc_dir)
    else:
        new_docs_to_index = read_docs_from_stdin()
    assert len(new_docs_to_index) > 0

    # Drop documents that are already indexed
    tmp_new_docs_to_index = []
    for new_doc in new_docs_to_index:
        if new_doc.name in documents:
            if skip_existing_names:
                print('Skipping: {} is already indexed!'.format(new_doc.name))
            else:
                raise Exception(
                    '{} is already indexed! Aborting.'.format(new_doc.name))
        else:
            tmp_new_docs_to_index.append(new_doc)
    new_docs_to_index = tmp_new_docs_to_index
    if len(new_docs_to_index) == 0:
        print('No new documents to index.')
        return

    # Update lexicon
    new_word_counts = get_word_counts(new_docs_to_index)
    lexicon_words = [
        Lexicon.Word(w.id, w.token,
                     (w.count + new_word_counts[w.token])
                     if w.token in new_word_counts else w.count)
        for w in old_lexicon
    ]
    for w in new_word_counts:
        if w not in old_lexicon:
            lexicon_words.append(
                Lexicon.Word(len(lexicon_words), w, new_word_counts[w]))
    lexicon = Lexicon(lexicon_words)

    base_doc_id = len(documents)
    new_documents = [Documents.Document(id=i + base_doc_id, name=d.name)
                     for i, d in enumerate(new_docs_to_index)]

    # Convert existing index.bin to a directory if needed
    if os.path.isfile(index_path):
        tmp_index_path = index_path + '.tmp'
        shutil.move(index_path, tmp_index_path)
        os.makedirs(index_path)
        shutil.move(
            tmp_index_path,
            os.path.join(index_path, '{:07d}-{:07d}.bin'.format(
                0, base_doc_id)))
    assert os.path.isdir(index_path)

    # Index the new documents
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   os.path.join(index_dir, 'data'), chunk_size)

    # Write out the new documents file
    shutil.move(doc_path, doc_path + '.old')
    all_documents = list(documents)
    all_documents.extend(new_documents)
    Documents(all_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)
    print('Done!')
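# A toy illustration of the lexicon-merge step in main() above: existing words
# keep their ids and have the new counts added, while unseen words are appended
# with fresh ids. The tokens and counts are made up, and it is assumed (as in the
# code above) that iterating a Lexicon yields Word entries and that membership
# tests check tokens.
def merge_lexicon_example():
    old = Lexicon([Lexicon.Word(0, 'senate', 10), Lexicon.Word(1, 'vote', 4)])
    new_word_counts = {'vote': 2, 'filibuster': 3}
    merged = [
        Lexicon.Word(w.id, w.token, w.count + new_word_counts.get(w.token, 0))
        for w in old
    ]
    for token, count in new_word_counts.items():
        if token not in old:
            merged.append(Lexicon.Word(len(merged), token, count))
    return Lexicon(merged)   # 'vote' -> count 6; 'filibuster' gets id 2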
def main(index_dir, silent, context_size, folder, use_gender):
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)
    words = get_lexicon()

    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think",
            "thing", "don’t", "like", "got", "people", "going", "talk",
            "right", "happened", ">>"
        ])
    print("Stop words", stop_words)

    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0

    # Create the output folder
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Create the lemmatizer
    lemmatizer = WordNetLemmatizer()

    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))

            starttime = None
            for p in postings:
                if starttime is None:
                    starttime = p.start

                # Cut a new chunk after 30 seconds
                if p.end - starttime > 30 * count:
                    pickle.dump(
                        dic,
                        open(
                            os.path.join(
                                folder,
                                'Doc_%d_Chunk_%d.p' % (doc_id, count - 1)),
                            'wb'))
                    dic = {}
                    count += 1
                    starttime = p.end

                # Get words in the posting
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    if word not in stop_words and len(word) > 1:
                        lemma = lemmatizer.lemmatize(word)
                        if lemma not in word_idx_dic:
                            word_idx_dic[lemma] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[lemma]
                        if idx_token in dic:
                            dic[idx_token] += 1
                        else:
                            dic[idx_token] = 1

    pickle.dump(word_idx_dic,
                open(os.path.join(folder, "word_idx.p"), "wb"))
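# Sketch of how the pickled output above could be read back: word_idx.p maps each
# lemmatized word to an integer id, and each Doc_<doc_id>_Chunk_<n>.p file maps
# those ids to counts for one ~30-second chunk. The function name and arguments
# here are illustrative, not part of the original script.
import os
import pickle

def load_chunk_counts(folder, doc_id, chunk):
    with open(os.path.join(folder, 'word_idx.p'), 'rb') as f:
        word_idx = pickle.load(f)
    idx_to_word = {i: w for w, i in word_idx.items()}
    with open(os.path.join(folder, 'Doc_%d_Chunk_%d.p' % (doc_id, chunk)), 'rb') as f:
        counts = pickle.load(f)
    return {idx_to_word[i]: c for i, c in counts.items()}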