def cooccurence():
    """Build the top-300 vocabulary, count word co-occurrences, and save the
    ranking, co-occurrence and stem tables as CSV under context.dbdir()."""
    logger.info('vocabulary load...')
    vocab = nltkwrapper.Vocab(context)
    ranking = vocab.vocab(300)
    stems = vocab.stemmer.getdict().items()
    logger.info('load word cooccurrence...')
    cooc = vocab.cooccurence(dict(ranking).keys()).items()
    cooc.sort(lambda x, y: cmp(y[1], x[1]))
    cooc = [(k0, k1, c) for ((k0, k1), c) in cooc]
    logger.info('save...')
    savecsv(os.path.join(context.dbdir(), 'ranking.csv'), ranking)
    savecsv(os.path.join(context.dbdir(), 'cooccurence.csv'), cooc)
    savecsv(os.path.join(context.dbdir(), 'stem.csv'), stems)
def saveranking(ranking):
    dbdir = context.dbdir()
    with io.open(os.path.join(dbdir, 'ranking.csv'), 'w') as wio:
        for word, freq in ranking:
            wio.write(u"\t".join([word, unicode(freq)]) + u"\n")
    return ranking
def bibliocount():
    counter = biblio.count()
    items = counter.items()
    items.sort(lambda x, y: cmp(y[1], x[1]))
    items = [[author, year, title, count]
             for ((author, year, title), count) in items]
    savecsv(os.path.join(context.dbdir(), 'biblio.csv'), items)
def loadranking():
    dirname = context.dbdir()
    ranking = []
    with io.open(os.path.join(dirname, 'ranking.csv')) as fio:
        for line in fio:
            word, count = line.rstrip().split("\t")
            ranking.append((word, int(count)))
    return ranking
def preparedir():
    dirs = [
        context.datadir(),
        context.graphdir(),
        context.dbdir(),
        context.textdatadir(),
    ]
    for dirname in dirs:
        if not os.path.exists(dirname):
            os.mkdir(dirname)
def loadorgword():
    """Rebuild a Stem object from the stem -> original-word pairs in stem.csv."""
    dirname = context.dbdir()
    dic = dict()
    with io.open(os.path.join(dirname, 'stem.csv')) as fio:
        for line in fio:
            stem, org = line.rstrip().split("\t")
            dic[stem] = org
    stemmer = nltkwrapper.Stem()
    stemmer.loaddata(dic)
    return stemmer
def load(limit=1300):
    """Load co-occurrence edges between ranked words, keeping only edges whose
    count exceeds `limit`, together with the word-frequency dict."""
    stemmer = ranking.loadorgword()
    ranks = ranking.loadtaggedranking(stemmer)
    ranks = [(w, cnt) for w, tag, cnt in ranks]
    rankdict = dict(ranks)
    words = set(w for w, cnt in ranks)
    dbdir = context.dbdir()
    edges = []
    with io.open(os.path.join(dbdir, 'cooccurence.csv')) as fio:
        for line in fio:
            word0, word1, count = line.rstrip().split("\t")
            word0 = stemmer.orgword(word0)
            word1 = stemmer.orgword(word1)
            if word0 in words and word1 in words:
                edges.append((word0, word1, int(count)))
    edges.sort(lambda x, y: cmp(y[2], x[2]))
    edges = [(w0, w1, k) for (w0, w1, k) in edges if k > limit]
    return (edges, rankdict)
def ranking():
    vocab = nltkwrapper.Vocab(context)
    ranking = vocab.vocab(300)
    stems = vocab.stemmer.getdict().items()
    savecsv(os.path.join(context.dbdir(), 'ranking.csv'), ranking)
    savecsv(os.path.join(context.dbdir(), 'stem.csv'), stems)

def graphdata():
    data = graph.load()
    savecsv(os.path.join(context.dbdir(), 'graphdata.csv'), data)

def nnranking():
    ranks = rankmod.loadnnranking()
    savecsv(os.path.join(context.dbdir(), 'nnranking.csv'), ranks)

def taggedranking():
    ranks = rankmod.loadtaggedranking()
    savecsv(os.path.join(context.dbdir(), 'taggedranking.csv'), ranks)
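# A minimal usage sketch, not part of the original module: assuming these
# commands are meant to be run as a script, one plausible order is to create
# the output directories first and then build each CSV table. The __main__
# guard below is an illustration only.
if __name__ == '__main__':
    preparedir()      # make the data/graph/db/text-data directories
    cooccurence()     # writes ranking.csv, cooccurence.csv and stem.csv
    bibliocount()     # writes biblio.csv
    graphdata()       # writes graphdata.csv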