def buildHeaders(categories, prefix,
                 directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Build the headers file for the pages of the given categories.

    Steps performed:

    1. Collect the page ids of every category in ``categories``.
    2. Drop pages whose document type is one of person, location,
       entertainment, organization or event; write the titles of the
       remaining pages to ``<directory>titles.txt``.
    3. Print the number of remaining pages and run ``HeadersFileBuilder``
       over them with ``prefix``.
    4. Read the statistics back through ``HeadersFileIndex.getAllStat()``
       and print / write ``"<text>: <cnt>"`` lines to
       ``<directory>headers.txt``.

    Args:
        categories: iterable of category titles to collect pages from.
        prefix: passed unchanged to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.
        directory: data directory handed to ``WikiAccessor`` and used as the
            location of the two output files. It is concatenated directly
            with the file names, so it must end with a path separator.
            Defaults to the path that was previously hard-coded.
    """
    # Pages of these document types are excluded from header extraction.
    excludedDocTypes = ('person', 'location', 'entertainment',
                        'organization', 'event')

    accessor = WikiAccessor(directory)
    categoryIndex = accessor.getIndex(CategoryIndex)
    titleIndex = accessor.getIndex(TitleIndex)
    documentTypes = accessor.getIndex(DocumentTypeIndex)

    pages = set()
    for cat in categories:
        categoryId = categoryIndex.getIdByTitle(cat)
        pages.update(categoryIndex.getAllPagesAsSet(categoryId))

    # codecs.open is kept (rather than the builtin open) so that the bytes
    # written stay exactly as before: it performs no newline translation.
    with codecs.open(directory + 'titles.txt', 'w', 'utf-8') as f:
        # Iterate over a snapshot because the set is mutated in the loop.
        for p in list(pages):
            if any(documentTypes.isDocType(p, docType)
                   for docType in excludedDocTypes):
                pages.discard(p)
            else:
                f.write(titleIndex.getTitleById(p) + '\n')

    print(len(pages))

    hb = HeadersFileBuilder(accessor, list(pages), prefix)
    hb.build()

    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            # Stop at the first header that occurs exactly once.
            # NOTE(review): this only makes sense if getAllStat() returns
            # items ordered by descending 'cnt' -- confirm.
            if item['cnt'] == 1:
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
'text': element[1], 'cnt': element[2] }) return res if __name__ == "__main__": #regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n') #text = " kdkd\n == kdkd==\n" #match = regex1.search(text) #print(match.end()) from pywikiaccessor.title_index import TitleIndex directory = "C:\\WORK\\science\\onpositive_data\\python\\" accessor = WikiAccessor(directory) docTypesIndex = DocumentTypeIndex(accessor) docIds = docTypesIndex.getDocsOfType("substance") titleIndex = accessor.getIndex(TitleIndex) for docId in docIds: print(titleIndex.getTitleById(docId)) doc_id = titleIndex.getIdByTitle("ALCAM") print(docTypesIndex.getDocTypeById(doc_id)) #hb = HeadersDBBuilder(accessor,list(docIds)) #hb.build() #hb.preProcess() #hb.processDocument(doc_id) #hi = HeadersDBIndex(accessor) #hi.getCountHeadersForDoc(docIds) #stat = hi.getAllStat(docIds) #for s in stat: # print (s['text']+": "+str(s['cnt']))