from sklearn.feature_extraction.text import CountVectorizer

def getVocabFreqDict(filenames):
    # Count, for each unigram and bigram, the number of files it appears in
    # (document frequency), and dump the result to '1-2-gram.json'.
    vocabDict = {}
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        if len(text) == 0:
            continue

        countVectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 2))
        # fit_transform() populates countVectorizer.vocabulary_; the matrix itself is unused
        countVectorizer.fit_transform([text])

        # each term is counted at most once per file, so the values are document frequencies
        for term in countVectorizer.vocabulary_.keys():
            vocabDict.setdefault(term, 0)
            vocabDict[term] += 1

        if i % 100 == 0:
            print(i, 'of', len(filenames))

    dumpJsonToFile('1-2-gram.json', vocabDict)
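# A minimal sketch (not part of the original script) of how the document
# frequencies dumped above could be used: an inverse-document-frequency
# weight for a term, assuming getDictFromFile() (used below) reads the dump
# back in and N is the number of documents in the collection.
def idfSketch(term, N):
    import math
    vocabDict = getDictFromFile('1-2-gram.json')
    df = vocabDict.get(term, 0)
    return math.log(N / df) if df > 0 else 0.0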
from nltk.stem.porter import PorterStemmer

def getStemclasses():
    # Group the vocabulary into stem classes: stem -> list of vocabulary words
    # sharing that stem (e.g., 'run' -> ['run', 'runs', 'running']).
    stemClasses = {}
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    stemmer = PorterStemmer()  # NLTK's Porter stemmer; the original called a useStemer() wrapper

    counter = 0
    for voc, vocDict in vocabDict.items():
        stem = stemmer.stem(voc)
        stemClasses.setdefault(stem, [])
        stemClasses[stem].append(voc)

        if counter % 10000 == 0:
            print('\t', counter, voc)
        counter += 1

    dumpJsonToFile('wiki-small-vocab-stem-classes.json', stemClasses, False)
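# A minimal sketch (hypothetical helper, not in the original): use the stem
# classes dumped above to expand a query word with the other members of its
# class, assuming the JSON dump is read back with getDictFromFile().
def expandWithStemClass(word):
    stemClasses = getDictFromFile('wiki-small-vocab-stem-classes.json')
    stem = PorterStemmer().stem(word)
    return stemClasses.get(stem, [word])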
def getTopKPages(pathnames, filenames):
    # Collect the outlinks of every wiki page into outlinksDict and dump it;
    # note that the filenames parameter is unused in this function.
    if len(pathnames) == 0:
        return []

    outlinksDict = {}
    for i in range(len(pathnames)):
        wiki = pathnames[i].strip()
        html = readTextFromFile(wiki)

        if i % 100 == 0:
            print(i, 'of', len(pathnames), 'wiki file:', wiki)
            print('\tlen:', len(outlinksDict))

        sourcewiki = getHTMLFilename(wiki)
        getWikiOutlinks(sourcewiki, html, outlinksDict)
        #if( i == 3 ):
        #    break

    dumpJsonToFile('./outlinksDict.json', outlinksDict)
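# A minimal sketch (hypothetical helper, not in the original) of ranking the
# top-k pages by in-link count from the dump above, assuming outlinksDict
# maps each source page to a list of the pages it links to.
def topKPagesByInlinks(k):
    outlinksDict = getDictFromFile('./outlinksDict.json')
    inlinkCounts = {}
    for source, outlinks in outlinksDict.items():
        for target in outlinks:
            inlinkCounts[target] = inlinkCounts.get(target, 0) + 1
    return sorted(inlinkCounts.items(), key=lambda pair: pair[1], reverse=True)[:k]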
def getVocabFreqDict(filenames, stop):
    # Variant of getVocabFreqDict() above that records, for each term, the
    # list of files ('f') it appears in and stops after `stop` files. Only
    # the tail of this function survived in the source; the opening loop is
    # reconstructed here to mirror the extraction above.
    vocabDict = {}
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        if len(text) == 0:
            continue

        countVectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 2))
        countVectorizer.fit_transform([text])

        for term in countVectorizer.vocabulary_.keys():
            vocabDict.setdefault(term, {'f': []})
            vocabDict[term]['f'].append(f)

        if i % 100 == 0:
            print(i, 'of', len(filenames))
        if i > stop:
            break

    return vocabDict

stop = 500
filenames = getHTMLPaths()
vocabDict = getVocabFreqDict(filenames, stop)
dumpJsonToFile('wiki-small-vocab-' + str(stop) + '.json', vocabDict, False)

word = 'hospital'
N = 6042
k = 20
getAssocMeasuresDocs(word, N, k)

'''
#command line: python A3.P1.py
if len(sys.argv) > 1:
    filename = 'wiki-small-vocab.json'
    word = sys.argv[1]
    N = 15103
    k = 20
    getAssocMeasuresWindow(word, N, filename, k)
'''
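# A minimal sketch (hypothetical, not the getAssocMeasuresDocs() called above)
# of one document-level association measure: Dice's coefficient over the
# per-term file lists ('f') built by getVocabFreqDict(filenames, stop).
def diceCoefficient(vocabDict, termA, termB):
    docsA = set(vocabDict.get(termA, {'f': []})['f'])
    docsB = set(vocabDict.get(termB, {'f': []})['f'])
    if len(docsA) + len(docsB) == 0:
        return 0.0
    return 2 * len(docsA & docsB) / (len(docsA) + len(docsB))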