Example No. 1
def getStat():
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    bi = WikiBaseIndex(accessor)
    print('Articles in Wikipedia:' + str(bi.getCount()))
    pages = getArticles(['Математика', 'Информатика', 'Физика'], accessor)
    print('Articles in Subset:' + str(len(pages)))
Example No. 2
def getStat():
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    bi = WikiBaseIndex(accessor)
    print('Articles in Wikipedia:' + str(bi.getCount()))
    pages = getArticles(['Математика', 'Информатика', 'Физика'], accessor)
    print('Articles in Subset:' + str(len(pages)))
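
Examples No. 1-2 call a getArticles helper that is not shown on this page. A minimal sketch, assuming it simply unions the pages of the given categories the way Example No. 5 does with CategoryIndex, could look like this (imports omitted, as in the surrounding examples):

def getArticles(categories, accessor):
    # Hypothetical reimplementation: collect the union of page ids for the
    # given category titles via CategoryIndex, mirroring Example No. 5.
    categoryIndex = accessor.getIndex(CategoryIndex)
    pages = set()
    for cat in categories:
        categoryId = categoryIndex.getIdByTitle(cat)
        pages.update(categoryIndex.getAllPagesAsSet(categoryId))
    return pages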
Example No. 3
def getStatByNouns():
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    pi = POSListIndex(accessor, 'miph_')
    for fType in pi.getFunctionalTypes():
        print(fType)
        print("Total nouns: " + str(pi.getTotalNounsCount(fType)))
        print("Total nouns: " + str(pi.getUniqueNounsCount(fType)))
        print("Good nouns count: " +
              str(len(pi.getFunctionalNouns(fType, 0.5))))
Example No. 4
def getStatByNouns():
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    pi = POSListIndex(accessor, 'miph_')
    for fType in pi.getFunctionalTypes():
        print(fType)
        print("Total nouns: " + str(pi.getTotalNounsCount(fType)))
        print("Total nouns: " + str(pi.getUniqueNounsCount(fType)))
        print("Good nouns count: " +
              str(len(pi.getFunctionalNouns(fType, 0.5))))
Example No. 5
def buildHeaders(categories, prefix):
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    categoryIndex = accessor.getIndex(CategoryIndex)
    titleIndex = accessor.getIndex(TitleIndex)
    documentTypes = accessor.getIndex(DocumentTypeIndex)

    pages = set()
    for cat in categories:
        categoryId = categoryIndex.getIdByTitle(cat)
        catPages = categoryIndex.getAllPagesAsSet(categoryId)
        pages.update(catPages)
    with codecs.open(directory + 'titles.txt', 'w', 'utf-8') as f:
        for p in list(pages):
            if (documentTypes.isDocType(p, 'person')
                    or documentTypes.isDocType(p, 'location')
                    or documentTypes.isDocType(p, 'entertainment')
                    or documentTypes.isDocType(p, 'organization')
                    or documentTypes.isDocType(p, 'event')):
                pages.discard(p)
            else:
                # print(titleIndex.getTitleById(p))
                f.write(titleIndex.getTitleById(p) + '\n')
    print(len(pages))
    hb = HeadersFileBuilder(accessor, list(pages), prefix)
    hb.build()
    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            if item['cnt'] == 1:
                break
            print(item['text'] + ": " + str(item['cnt']))
            f.write(item['text'] + ": " + str(item['cnt']) + '\n')
Example No. 6
def buildHeaders(categories, prefix):
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    pages = getArticles(categories, accessor)
    print(len(pages))
    hb = HeadersFileBuilder(accessor, list(pages), prefix)
    hb.build()
    hi = HeadersFileIndex(accessor, prefix)
    stat = hi.getAllStat()
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            if item['cnt'] == 1:
                break
            print(item['text'] + ": " + str(item['cnt']))
            f.write(item['text'] + ": " + str(item['cnt']) + '\n')
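
For reference, a hypothetical invocation of buildHeaders, borrowing the category list from Examples No. 1-2 and the 'miph_' index prefix from Examples No. 3-4:

if __name__ == "__main__":
    # Assumed arguments: the categories and prefix come from the other
    # examples on this page, not from the original project.
    buildHeaders(['Математика', 'Информатика', 'Физика'], 'miph_')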
Example No. 7
            res.append({
                'id': element[0],
                'text': element[1],
                'cnt': element[2]
            })
        return res


if __name__ == "__main__":
    #regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n')
    #text = " kdkd\n == kdkd==\n"
    #match = regex1.search(text)
    #print(match.end())
    from pywikiaccessor.title_index import TitleIndex
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    docTypesIndex = DocumentTypeIndex(accessor)
    docIds = docTypesIndex.getDocsOfType("substance")
    titleIndex = accessor.getIndex(TitleIndex)
    for docId in docIds:
        print(titleIndex.getTitleById(docId))
    doc_id = titleIndex.getIdByTitle("ALCAM")
    print(docTypesIndex.getDocTypeById(doc_id))
#hb = HeadersDBBuilder(accessor,list(docIds))
#hb.build()
#hb.preProcess()
#hb.processDocument(doc_id)
#hi = HeadersDBIndex(accessor)
#hi.getCountHeadersForDoc(docIds)
#stat = hi.getAllStat(docIds)
#for s in stat:
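
The truncated fragment at the top of this example looks like the tail of a statistics method that turns (id, text, cnt) rows into dictionaries. A self-contained sketch of that conversion step, under that assumption, would be:

def rowsToStat(rows):
    # Hypothetical helper mirroring the fragment above: convert (id, text, cnt)
    # tuples into the list-of-dicts shape that the getAllStat() callers expect.
    res = []
    for element in rows:
        res.append({
            'id': element[0],
            'text': element[1],
            'cnt': element[2]
        })
    return res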
Example No. 8
            y=Y_sklearn[:,1],
            mode='markers',
            marker = go.Marker(
                size=12,
                line= go.Line(
                    color='rgba(217, 217, 217, 0.14)',
                    width=0.5),
                opacity=0.8))
        traces.append(trace)
        
        
        data = go.Data(traces)
        layout = go.Layout(xaxis = go.XAxis(title='PC1', showline=False),
                           yaxis = go.YAxis(title='PC2', showline=False))
        fig = go.Figure(data=data, layout=layout)
        if self.outputType=='file':
            print(py.plot(fig, filename='pca.html'))
        else:
            return py.plot(fig, output_type='div')


     
if __name__ == "__main__":
    from pywikiaccessor.wiki_accessor import WikiAccessor
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    TextVisualizator(accessor,'miph_',output_type="div").getStat()
    #PCAWikiVisualizator(accessor,'miph_').getHists("UNIQUE_VERBS")
    #PCAWikiVisualizator(accessor,'miph_').getHists("VERBS")
    #PCAWikiVisualizator(accessor,'miph_').getHists("UNIQUE_NOUNS")
    #PCAWikiVisualizator(accessor,'miph_').getRelativeHists("UNIQUE_VERBS")
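
The go.Marker, go.Line, go.Data, go.XAxis and go.YAxis wrappers in this fragment come from the legacy plotly graph_objs API. Under plotly 4+ the same scatter figure is usually built with plain dicts; a runnable sketch (with random stand-in data in place of the PCA projection Y_sklearn) might be:

import numpy as np
import plotly.graph_objects as go
from plotly.offline import plot

Y_sklearn = np.random.rand(50, 2)  # stand-in for the PCA-projected matrix

trace = go.Scatter(
    x=Y_sklearn[:, 0],
    y=Y_sklearn[:, 1],
    mode='markers',
    marker=dict(size=12,
                line=dict(color='rgba(217, 217, 217, 0.14)', width=0.5),
                opacity=0.8))
layout = go.Layout(xaxis=dict(title='PC1', showline=False),
                   yaxis=dict(title='PC2', showline=False))
fig = go.Figure(data=[trace], layout=layout)
plot(fig, filename='pca.html')  # or plot(fig, output_type='div') for embedding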
Example No. 9
def buildPOSList(prefix):
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    sb = POSListBuilder(accessor, prefix)
    sb.build()
    sb.printTfIdf()
Example No. 10
def buildStat(prefix):
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    sb = StatBuilder(accessor, prefix)
    sb.build()
    sb.print()
Example No. 11
def buildFragments(prefix):
    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = WikiAccessor(directory)
    fb = СollocationBuilder(accessor, prefix)
    fb.build()
    fb.printFragments(True)
Example No. 12
def buildPOSList(prefix):
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    sb = POSListBuilder(accessor, prefix)
    sb.build()
    sb.printTfIdf()
Example No. 13
def buildFragments(prefix):
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    fb = СollocationBuilder(accessor, prefix)
    fb.build()
    fb.printFragments(True)
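
The build helpers in Examples No. 9-13 all follow the same accessor-plus-prefix pattern; a hypothetical driver that runs them together with the 'miph_' prefix used in Examples No. 3-4 and No. 8 might be:

if __name__ == "__main__":
    # Assumed entry point: the prefix is borrowed from the other examples on
    # this page; each build* function creates its own WikiAccessor internally.
    prefix = 'miph_'
    buildPOSList(prefix)
    buildStat(prefix)
    buildFragments(prefix)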