예제 #1
0
def buildHeaders(categories, prefix):
    """Build the headers file for the pages of ``categories`` and dump stats.

    Steps performed:

    1. Collect the union of all pages returned by the category index for
       every title in ``categories``.
    2. Drop pages whose document type is one of ``person``, ``location``,
       ``entertainment``, ``organization`` or ``event``; write the titles of
       the remaining pages to ``titles.txt`` (UTF-8, one per line) and print
       how many pages remain.
    3. Run ``HeadersFileBuilder`` over the remaining pages with ``prefix``.
    4. Read the statistics back through ``HeadersFileIndex`` and write/print
       one ``"<text>: <cnt>"`` line per item to ``headers.txt``, stopping at
       the first item whose ``cnt`` equals 1.

    Args:
        categories: iterable of category titles to collect pages from.
        prefix: passed unchanged to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.

    Returns:
        None. Results are written to files in the data directory and echoed
        to stdout.
    """
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    # Document types that must not contribute headers; checked in this order.
    excludedTypes = ('person', 'location', 'entertainment',
                     'organization', 'event')

    accessor = WikiAccessor(directory)
    categoryIndex = accessor.getIndex(CategoryIndex)
    titleIndex = accessor.getIndex(TitleIndex)
    documentTypes = accessor.getIndex(DocumentTypeIndex)

    # A set de-duplicates pages that belong to several categories.
    pages = set()
    for cat in categories:
        categoryId = categoryIndex.getIdByTitle(cat)
        pages.update(categoryIndex.getAllPagesAsSet(categoryId))

    # Filter and write in a single pass; the `with` block closes the file,
    # so no explicit close() is needed.
    keptPages = []
    with codecs.open(directory + 'titles.txt', 'w', 'utf-8') as f:
        for p in pages:
            if any(documentTypes.isDocType(p, t) for t in excludedTypes):
                continue
            keptPages.append(p)
            f.write(titleIndex.getTitleById(p) + '\n')
    print(len(keptPages))

    hb = HeadersFileBuilder(accessor, keptPages, prefix)
    hb.build()

    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            # NOTE(review): stopping at the first cnt == 1 presumably relies
            # on getAllStat() returning items by descending count - confirm.
            if item['cnt'] == 1:
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
예제 #2
0
                'text': element[1],
                'cnt': element[2]
            })
        return res


if __name__ == "__main__":
    # Disabled scratch check of a regex that appears to target
    # wiki-style "== header ==" lines:
    # regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n')
    # text = " kdkd\n == kdkd==\n"
    # match = regex1.search(text)
    # print(match.end())
    from pywikiaccessor.title_index import TitleIndex

    # Open the local wiki data and the two indexes this script queries.
    dataDir = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(dataDir)
    typeIndex = DocumentTypeIndex(accessor)
    docIds = typeIndex.getDocsOfType("substance")
    titles = accessor.getIndex(TitleIndex)

    # Print the title of every document returned for the "substance" type.
    for pageId in docIds:
        print(titles.getTitleById(pageId))

    # Resolve one article by its title and print the type recorded for it.
    doc_id = titles.getIdByTitle("ALCAM")
    print(typeIndex.getDocTypeById(doc_id))
#hb = HeadersDBBuilder(accessor,list(docIds))
#hb.build()
#hb.preProcess()
#hb.processDocument(doc_id)
#hi = HeadersDBIndex(accessor)
#hi.getCountHeadersForDoc(docIds)
#stat = hi.getAllStat(docIds)
#for s in stat:
#    print (s['text']+": "+str(s['cnt']))