Example #1
File: sepgen.py Project: takada-at/sep
def cooccurence():
    logger.info('vocabrary load...')
    vocab = nltkwrapper.Vocab(context)
    ranking = vocab.vocab(300)
    stems = vocab.stemmer.getdict().items()
    logger.info('load words coocurence...')
    cooc = vocab.cooccurence(dict(ranking).keys()).items()
    cooc.sort(lambda x, y: cmp(y[1], x[1]))
    cooc = [(k0, k1, c) for ((k0, k1), c) in cooc]
    logger.info('save...')
    savecsv(os.path.join(context.dbdir(), 'ranking.csv'), ranking)
    savecsv(os.path.join(context.dbdir(), 'cooccurence.csv'), cooc)
    savecsv(os.path.join(context.dbdir(), 'stem.csv'), stems)
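savecsv() is called throughout these examples but is not part of this listing. Judging from how saveranking() writes and loadranking() reads 'ranking.csv' below, it presumably emits one tab-joined line per row; a minimal sketch under that assumption (the real helper in sepgen.py may differ):

# Hypothetical sketch of savecsv(); assumption: one tab-separated line per row,
# matching the "\t"-split format that loadranking() and loadorgword() read back.
import io


def savecsv(path, rows):
    with io.open(path, 'w') as wio:
        for row in rows:
            wio.write(u"\t".join(u"%s" % col for col in row) + u"\n")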
Example #2
File: ranking.py Project: takada-at/sep
def saveranking(ranking):
    dbdir = context.dbdir()
    with io.open(os.path.join(dbdir, 'ranking.csv'), 'w') as wio:
        for word, freq in ranking:
            wio.write(u"\t".join([word, unicode(freq)]) + u"\n")

    return ranking
Example #3
File: sepgen.py Project: takada-at/sep
def bibliocount():
    counter = biblio.count()
    items = counter.items()
    items.sort(lambda x, y: cmp(y[1], x[1]))
    items = [[author, year, title, count]
             for ((author, year, title), count) in items]
    savecsv(os.path.join(context.dbdir(), 'biblio.csv'), items)
Example #4
File: ranking.py Project: takada-at/sep
def loadranking():
    dirname = context.dbdir()
    ranking = []
    with io.open(os.path.join(dirname, 'ranking.csv')) as fio:
        for line in fio:
            word, count = line.rstrip().split("\t")
            ranking.append((word, int(count)))

    return ranking
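For reference, a hypothetical round trip through the two ranking helpers above, assuming context.dbdir() resolves to an existing, writable directory:

# Hypothetical usage of saveranking()/loadranking(); assumes context.dbdir()
# points at an existing, writable directory.
saveranking([(u"word", 42), (u"term", 7)])
print(loadranking())  # -> [(u'word', 42), (u'term', 7)]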
Example #5
File: sepgen.py Project: takada-at/sep
def preparedir():
    dirs = [
        context.datadir(),
        context.graphdir(),
        context.dbdir(),
        context.textdatadir()
    ]
    for dirname in dirs:
        if not os.path.exists(dirname):
            os.mkdir(dirname)
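preparedir() uses the check-then-create pattern; on Python 3.2+ the same effect (minus the race between the exists() check and the mkdir()) can be written with os.makedirs and exist_ok, for example:

# Python 3 alternative to the exists()/mkdir() pair above.
import os

dirs = ["data", "graph", "db", "textdata"]  # stand-ins for the context.*dir() calls
for dirname in dirs:
    # exist_ok skips the separate exists() check; note makedirs() also creates
    # missing parent directories, which os.mkdir() does not.
    os.makedirs(dirname, exist_ok=True)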
Example #6
File: ranking.py Project: takada-at/sep
def loadorgword():
    dirname = context.dbdir()
    dic = dict()
    with io.open(os.path.join(dirname, 'stem.csv')) as fio:
        for line in fio:
            stem, org = line.rstrip().split("\t")
            dic[stem] = org

    stemmer = nltkwrapper.Stem()
    stemmer.loaddata(dic)
    return stemmer
Example #7
File: graph.py Project: takada-at/sep
def load(limit=1300):
    stemmer = ranking.loadorgword()
    ranks = ranking.loadtaggedranking(stemmer)
    ranks = [(w, cnt) for w, tag, cnt in ranks]
    rankdict = dict(ranks)
    words = set(w[0] for w in ranks)
    dbdir = context.dbdir()
    edges = []
    with io.open(os.path.join(dbdir, 'cooccurence.csv')) as fio:
        for line in fio:
            word0, word1, count = line.rstrip().split("\t")
            word0 = stemmer.orgword(word0)
            word1 = stemmer.orgword(word1)
            if word0 in words and word1 in words:
                edges.append((word0, word1, int(count)))

    edges.sort(lambda x, y: cmp(y[2], x[2]))
    edges = [(w0, w1, k) for (w0, w1, k) in edges if k > limit]
    return (edges, rankdict)
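The cmp-style sorts used in load(), cooccurence() and bibliocount() are Python 2 only (list.sort() no longer accepts a comparison function and the cmp() builtin is gone in Python 3); the same descending order would be written with a key function, for example:

# Python 3 form of the descending sort on the count field.
edges = [("word0", "word1", 5), ("word0", "word2", 12)]
edges.sort(key=lambda e: e[2], reverse=True)  # highest co-occurrence count first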
Example #8
File: sepgen.py Project: takada-at/sep
def ranking():
    vocab = nltkwrapper.Vocab(context)
    ranking = vocab.vocab(300)
    stems = vocab.stemmer.getdict().items()
    savecsv(os.path.join(context.dbdir(), 'ranking.csv'), ranking)
    savecsv(os.path.join(context.dbdir(), 'stem.csv'), stems)
Example #9
File: sepgen.py Project: takada-at/sep
def graphdata():
    data = graph.load()
    savecsv(os.path.join(context.dbdir(), 'graphdata.csv'), data)
Example #10
File: sepgen.py Project: takada-at/sep
def nnranking():
    ranks = rankmod.loadnnranking()
    savecsv(os.path.join(context.dbdir(), 'nnranking.csv'), ranks)
Example #11
File: sepgen.py Project: takada-at/sep
def taggedranking():
    ranks = rankmod.loadtaggedranking()
    savecsv(os.path.join(context.dbdir(), 'taggedranking.csv'), ranks)