示例#1
0
 def build_locdic_from_outfile(filename, parser=SimpleWordParser(),
                               min_word_docs_frac=0, max_word_docs_frac=0.2, min_word_count_frac=0, max_word_count_frac=0.01,
                               doc_name_weight=0):
     """Build a LocationDictionary from one or more corpus text files.

     Each file is read line by line:
       * a line starting with CorpusReader.PAGE_NAME_PREFIX sets the current
         page name (and clears the current section name);
       * a line starting with CorpusReader.SECTION_NAME_PREFIX sets the
         current section name;
       * any other line is parsed with ``parser.parse(text, calc_weights=False)``
         and added to the dictionary under the location
         ``'F<file index>/<page name>/<section name>'``, together with the
         words extracted from the page and section names.

     The current page/section names and the line counter are NOT reset
     between files.

     NOTE: the default ``parser`` is instantiated once, when the function is
     defined, so all calls relying on the default share the same instance.

     Args:
         filename: path of a single file (a string), or an iterable of paths.
         parser: object whose ``parse(text, calc_weights=False)`` returns the
             words of a content line.
         min_word_docs_frac, max_word_docs_frac, min_word_count_frac,
         max_word_count_frac: forwarded to
             ``LocationDictionary.set_search_word_filter``.
         doc_name_weight: forwarded to the ``LocationDictionary`` constructor.

     Returns:
         The populated LocationDictionary.

     Raises:
         AssertionError: if ``filename`` (or any entry of it) is None, or if
             a content line appears before both a page name and a section
             name have been seen.
     """
     locdic = LocationDictionary(save_locations=False, doc_name_weight=doc_name_weight)
     locdic.set_search_word_filter(min_word_docs_frac=min_word_docs_frac, max_word_docs_frac=max_word_docs_frac,
                                   min_word_count_frac=min_word_count_frac, max_word_count_frac=max_word_count_frac)

     # The original asserted on the builtin `file`, which never fails;
     # validate the actual argument instead.
     assert filename is not None, 'filename must not be None'

     try:
         string_types = basestring  # Python 2: matches both str and unicode
     except NameError:
         string_types = str
     if isinstance(filename, string_types):
         filenames = [filename]
     else:
         # Materialise once so a one-shot iterable survives the validation below.
         filenames = list(filename)
         assert all(fn is not None for fn in filenames), 'file names must not be None'

     page_prefix = CorpusReader.PAGE_NAME_PREFIX
     section_prefix = CorpusReader.SECTION_NAME_PREFIX
     num_pages, num_sections = 0, 0
     page_name, section_name = None, None
     num_lines = 0
     for ifname, fname in enumerate(filenames):
         print('Building locdic from file #%d: %s' % (ifname, fname))
         with open(fname, 'rt') as infile:
             # Iterating a file stops at EOF by itself and never yields an
             # empty string, so no explicit EOF check is needed.
             for text in infile:
                 if text.startswith(page_prefix):
                     page_name = text[len(page_prefix):].strip()
                     section_name = None  # a new page starts without a section
                     num_pages += 1
                 elif text.startswith(section_prefix):
                     section_name = text[len(section_prefix):].strip()
                     num_sections += 1
                 else:
                     assert (page_name is not None) and (section_name is not None)
                     section_words = parser.parse(text, calc_weights=False)
                     location = 'F%d/%s/%s' % (ifname, page_name, section_name)
                     name_words = (CorpusReader.words_from_part_name(page_name) +
                                   CorpusReader.words_from_part_name(section_name))
                     locdic.add_words(location, name_words, section_words)
                 num_lines += 1
                 if num_lines % 100000 == 0:
                     print(' read %d lines: %d pages, %d sections -> %d words' % (num_lines, num_pages, num_sections, len(locdic.word_ids)))
     return locdic