def build_locdic_from_outfile(filename, parser=SimpleWordParser(),
                              min_word_docs_frac=0, max_word_docs_frac=0.2,
                              min_word_count_frac=0, max_word_count_frac=0.01,
                              doc_name_weight=0):
    """Build a LocationDictionary from one or more corpus output files.

    Each file is read line by line. A line starting with
    ``CorpusReader.PAGE_NAME_PREFIX`` opens a new page (and clears the current
    section), a line starting with ``CorpusReader.SECTION_NAME_PREFIX`` opens a
    new section, and every other line is parsed as text of the current
    page/section and added to the dictionary under the location name
    ``'F<file index>/<page name>/<section name>'``.

    Args:
        filename: Path of a single file (a string), or an iterable of paths.
        parser: Object whose ``parse(text, calc_weights=False)`` returns the
            words of a text line. NOTE: the default instance is created once,
            when the function is defined, and is shared by all calls that do
            not pass their own parser.
        min_word_docs_frac, max_word_docs_frac, min_word_count_frac,
        max_word_count_frac: Passed unchanged to
            ``LocationDictionary.set_search_word_filter``.
        doc_name_weight: Passed unchanged to the ``LocationDictionary``
            constructor.

    Returns:
        The populated ``LocationDictionary`` (created with
        ``save_locations=False``).

    Raises:
        AssertionError: If one of the file names is None, or if a text line
            appears before both a page name and a section name were seen.
    """
    locdic = LocationDictionary(save_locations=False, doc_name_weight=doc_name_weight)
    locdic.set_search_word_filter(min_word_docs_frac=min_word_docs_frac,
                                  max_word_docs_frac=max_word_docs_frac,
                                  min_word_count_frac=min_word_count_frac,
                                  max_word_count_frac=max_word_count_frac)

    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str

    if isinstance(filename, string_types):
        filenames = [filename]
    else:
        # Materialise once so that a generator of names is not exhausted by
        # the validation below before the files are actually read.
        filenames = list(filename)
        assert not any(fn is None for fn in filenames), 'file name is None'

    num_pages, num_sections, num_lines = 0, 0, 0
    # The current page/section deliberately carry over from one file to the
    # next; they are only reset by the prefix lines themselves.
    page_name, section_name = None, None
    page_prefix = CorpusReader.PAGE_NAME_PREFIX
    section_prefix = CorpusReader.SECTION_NAME_PREFIX

    for ifname, fname in enumerate(filenames):
        # Single-argument print(...) gives the same output under the Python 2
        # print statement and the Python 3 print function.
        print('Building locdic from file #%d: %s' % (ifname, fname))
        with open(fname, 'rt') as infile:
            # Iterating a file never yields an empty string, so no explicit
            # end-of-file check is needed inside the loop.
            for text in infile:
                if text.startswith(page_prefix):
                    page_name = text[len(page_prefix):].strip()
                    section_name = None  # a new page starts without a section
                    num_pages += 1
                elif text.startswith(section_prefix):
                    section_name = text[len(section_prefix):].strip()
                    num_sections += 1
                else:
                    assert (page_name is not None) and (section_name is not None)
                    section_words = parser.parse(text, calc_weights=False)
                    locdic.add_words('F%d/%s/%s' % (ifname, page_name, section_name),
                                     CorpusReader.words_from_part_name(page_name)
                                     + CorpusReader.words_from_part_name(section_name),
                                     section_words)
                num_lines += 1
                if num_lines % 100000 == 0:
                    print(' read %d lines: %d pages, %d sections -> %d words'
                          % (num_lines, num_pages, num_sections, len(locdic.word_ids)))
    return locdic