def getStat(directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Print the overall article count and the size of a category subset.

    Args:
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    bi = WikiBaseIndex(accessor)
    print('Articles in Wikipedia:' + str(bi.getCount()))

    # Subset = whatever getArticles() returns for these three category titles.
    pages = getArticles(['Математика', 'Информатика', 'Физика'], accessor)
    print('Articles in Subset:' + str(len(pages)))
def getStat(directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Print the overall article count and the size of a category subset.

    Args:
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    bi = WikiBaseIndex(accessor)
    print('Articles in Wikipedia:' + str(bi.getCount()))

    # Subset = whatever getArticles() returns for these three category titles.
    pages = getArticles(['Математика', 'Информатика', 'Физика'], accessor)
    print('Articles in Subset:' + str(len(pages)))
def getStatByNouns(directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Print noun statistics for every functional type of the 'miph_' index.

    For each value returned by ``POSListIndex.getFunctionalTypes()`` the
    function prints the type itself, its total noun count, its unique noun
    count and the number of nouns returned by ``getFunctionalNouns`` with
    the second argument set to 0.5.

    Args:
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    pi = POSListIndex(accessor, 'miph_')
    for fType in pi.getFunctionalTypes():
        print(fType)
        print("Total nouns: " + str(pi.getTotalNounsCount(fType)))
        # This line used to be labelled "Total nouns: " as well, which made
        # the unique count indistinguishable from the total in the output.
        print("Unique nouns: " + str(pi.getUniqueNounsCount(fType)))
        print("Good nouns count: " + str(len(pi.getFunctionalNouns(fType, 0.5))))
def getStatByNouns(directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Print noun statistics for every functional type of the 'miph_' index.

    For each value returned by ``POSListIndex.getFunctionalTypes()`` the
    function prints the type itself, its total noun count, its unique noun
    count and the number of nouns returned by ``getFunctionalNouns`` with
    the second argument set to 0.5.

    Args:
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    pi = POSListIndex(accessor, 'miph_')
    for fType in pi.getFunctionalTypes():
        print(fType)
        print("Total nouns: " + str(pi.getTotalNounsCount(fType)))
        # This line used to be labelled "Total nouns: " as well, which made
        # the unique count indistinguishable from the total in the output.
        print("Unique nouns: " + str(pi.getUniqueNounsCount(fType)))
        print("Good nouns count: " + str(len(pi.getFunctionalNouns(fType, 0.5))))
def buildHeaders(categories, prefix,
                 directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Build the headers file for the given categories and dump its statistics.

    The pages are obtained from ``getArticles(categories, accessor)``, fed to
    ``HeadersFileBuilder``, and the statistics returned by
    ``HeadersFileIndex.getAllStat()`` are printed and written to
    ``headers.txt`` inside ``directory``.

    Args:
        categories: Category titles forwarded to ``getArticles``.
        prefix: Prefix forwarded to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.
        directory: Path handed to ``WikiAccessor`` and used as the location
            of the output file. Defaults to the previously hard-coded path.
    """
    accessor = WikiAccessor(directory)
    pages = getArticles(categories, accessor)
    print(len(pages))

    hb = HeadersFileBuilder(accessor, list(pages), prefix)
    hb.build()

    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()

    # The context manager closes the file; no explicit close() is needed.
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            # Stop at the first entry seen exactly once.
            # NOTE(review): presumably `stat` is ordered by descending count,
            # so everything after this point is a singleton too - confirm.
            if item['cnt'] == 1:
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
def buildHeaders(categories, prefix,
                 directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Build the headers file for the given categories and dump its statistics.

    Pages of all requested categories are collected, pages whose document
    type is one of person / location / entertainment / organization / event
    are dropped, and the titles of the remaining pages are written to
    ``titles.txt``. The remaining pages are then fed to
    ``HeadersFileBuilder``; the statistics returned by
    ``HeadersFileIndex.getAllStat()`` are printed and written to
    ``headers.txt``. Both files are created inside ``directory``.

    Args:
        categories: Category titles resolved through ``CategoryIndex``.
        prefix: Prefix forwarded to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.
        directory: Path handed to ``WikiAccessor`` and used as the location
            of the output files. Defaults to the previously hard-coded path.
    """
    accessor = WikiAccessor(directory)
    categoryIndex = accessor.getIndex(CategoryIndex)
    titleIndex = accessor.getIndex(TitleIndex)
    documentTypes = accessor.getIndex(DocumentTypeIndex)

    # Union of the pages of every requested category.
    pages = set()
    for cat in categories:
        categoryId = categoryIndex.getIdByTitle(cat)
        pages.update(categoryIndex.getAllPagesAsSet(categoryId))

    # Document types that are excluded from header building; checked in the
    # same order as before, and any() short-circuits like the `or` chain did.
    excludedTypes = ('person', 'location', 'entertainment',
                     'organization', 'event')

    # The context manager closes the file; no explicit close() is needed.
    with codecs.open(directory + 'titles.txt', 'w', 'utf-8') as f:
        # Iterate over a snapshot because `pages` is mutated in the loop.
        for p in list(pages):
            if any(documentTypes.isDocType(p, t) for t in excludedTypes):
                pages.discard(p)
            else:
                f.write(titleIndex.getTitleById(p) + '\n')

    print(len(pages))

    hb = HeadersFileBuilder(accessor, list(pages), prefix)
    hb.build()

    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()

    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            # Stop at the first entry seen exactly once.
            # NOTE(review): presumably `stat` is ordered by descending count,
            # so everything after this point is a singleton too - confirm.
            if item['cnt'] == 1:
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
res.append({ 'id': element[0], 'text': element[1], 'cnt': element[2] }) return res if __name__ == "__main__": #regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n') #text = " kdkd\n == kdkd==\n" #match = regex1.search(text) #print(match.end()) from pywikiaccessor.title_index import TitleIndex directory = "C:\\WORK\\science\\onpositive_data\\python\\" accessor = WikiAccessor(directory) docTypesIndex = DocumentTypeIndex(accessor) docIds = docTypesIndex.getDocsOfType("substance") titleIndex = accessor.getIndex(TitleIndex) for docId in docIds: print(titleIndex.getTitleById(docId)) doc_id = titleIndex.getIdByTitle("ALCAM") print(docTypesIndex.getDocTypeById(doc_id)) #hb = HeadersDBBuilder(accessor,list(docIds)) #hb.build() #hb.preProcess() #hb.processDocument(doc_id) #hi = HeadersDBIndex(accessor) #hi.getCountHeadersForDoc(docIds) #stat = hi.getAllStat(docIds) #for s in stat:
def buildPOSList(prefix, directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Run ``POSListBuilder`` for ``prefix`` and print its TF-IDF output.

    Args:
        prefix: Prefix forwarded to ``POSListBuilder``.
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    sb = POSListBuilder(accessor, prefix)
    sb.build()
    sb.printTfIdf()
def buildStat(prefix, directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Run ``StatBuilder`` for ``prefix`` and print its result.

    Args:
        prefix: Prefix forwarded to ``StatBuilder``.
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    sb = StatBuilder(accessor, prefix)
    sb.build()
    sb.print()
def buildFragments(prefix, directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Run the collocation builder for ``prefix`` and print its fragments.

    Args:
        prefix: Prefix forwarded to the builder.
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    # NOTE(review): the class name below appears to begin with a Cyrillic
    # letter rather than Latin "C"; it is kept exactly as written - confirm
    # it matches the spelling used where the class is defined.
    fb = СollocationBuilder(accessor, prefix)
    fb.build()
    fb.printFragments(True)
def buildPOSList(prefix,
                 directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Run ``POSListBuilder`` for ``prefix`` and print its TF-IDF output.

    Args:
        prefix: Prefix forwarded to ``POSListBuilder``.
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    sb = POSListBuilder(accessor, prefix)
    sb.build()
    sb.printTfIdf()
def buildFragments(prefix,
                   directory="C:\\WORK\\science\\onpositive_data\\python\\"):
    """Run the collocation builder for ``prefix`` and print its fragments.

    Args:
        prefix: Prefix forwarded to the builder.
        directory: Path handed to ``WikiAccessor``. Defaults to the location
            that was previously hard-coded in this function.
    """
    accessor = WikiAccessor(directory)
    # NOTE(review): the class name below appears to begin with a Cyrillic
    # letter rather than Latin "C"; it is kept exactly as written - confirm
    # it matches the spelling used where the class is defined.
    fb = СollocationBuilder(accessor, prefix)
    fb.build()
    fb.printFragments(True)