예제 #1
0
def preRead():
    """Load the character vocabulary into module-level globals.

    Reads the id-to-character list from ``id2ch_w2v.txt`` through
    ``ljqpy.LoadList``, derives the reverse character-to-id lookup, resets
    ``id2tg`` to an empty list, records the vocabulary size and prints it.
    """
    global id2ch, ch2id, id2tg, vocabulary_size
    id2ch = ljqpy.LoadList('id2ch_w2v.txt')
    # Position in the list is the id; a character that appears more than once
    # ends up mapped to its last position.
    ch2id = dict((char, index) for index, char in enumerate(id2ch))
    id2tg = list()
    vocabulary_size = len(id2ch)
    print('vocabulary: %d' % vocabulary_size)
예제 #2
0
def preRead():
    """Load the character vocabulary found under ``data_path`` into globals.

    The list read from ``data_path + 'id2ch.txt'`` becomes ``id2ch``; its
    inverse (character -> index) becomes ``ch2id``. ``id2tg`` is reset to an
    empty list and the vocabulary size is stored and printed.
    """
    global id2ch, ch2id, id2tg, vocabulary_size
    vocab_file = data_path + 'id2ch.txt'
    id2ch = ljqpy.LoadList(vocab_file)
    # Build the inverse lookup; later duplicates overwrite earlier ones.
    ch2id = {}
    for index, char in enumerate(id2ch):
        ch2id[char] = index
    id2tg = []
    vocabulary_size = len(id2ch)
    print(vocabulary_size)
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    """Return the input and output token lists for a two-column corpus.

    When ``dict_file`` names an existing file, both vocabularies are read
    back from it: the entries before the ``<@@@>`` marker form the input
    vocabulary and the entries after it the output vocabulary.

    Otherwise the corpus ``fn`` is loaded with ``ljqpy.LoadCSV``, the tokens
    of the first two columns are counted (splitting on ``delimiter``), and
    tokens seen fewer than ``min_freq`` times are dropped. If ``dict_file``
    is given, the freshly built vocabularies are saved to it in the same
    marker-separated layout.

    :param fn: corpus path passed to ``ljqpy.LoadCSV``.
    :param min_freq: minimum count a token needs to be kept.
    :param delimiter: separator between tokens inside a column.
    :param dict_file: optional cache file for the two vocabularies.
    :return: ``(itokens, otokens)`` as ``TokenList`` objects.
    """
    if dict_file is not None and os.path.exists(dict_file):
        print('loading', dict_file)
        cached = ljqpy.LoadList(dict_file)
        marker_at = cached.index('<@@@>')
        return TokenList(cached[:marker_at]), TokenList(cached[marker_at + 1:])

    # One counter per column; zip() below stops after the second column.
    counters = [{}, {}]
    for row in ljqpy.LoadCSV(fn):
        for column, counter in zip(row, counters):
            for token in column.split(delimiter):
                if token in counter:
                    counter[token] += 1
                else:
                    counter[token] = 1

    # FreqDict2List yields (token, count) pairs; keep the frequent tokens in
    # whatever order it returns them.
    vocabularies = [
        [token for token, count in ljqpy.FreqDict2List(counter)
         if count >= min_freq]
        for counter in counters
    ]
    source_words, target_words = vocabularies
    print('seq 1 words:', len(source_words))
    print('seq 2 words:', len(target_words))
    itokens = TokenList(source_words)
    otokens = TokenList(target_words)
    if dict_file is not None:
        ljqpy.SaveList(source_words + ['<@@@>'] + target_words, dict_file)
    return itokens, otokens
예제 #4
0
def TextRank():
    """Print TextRank keywords for every comma-containing line of the merged text.

    Each qualifying line of ``training/merged_text.txt`` is echoed, followed
    by one ``keyword weight`` line per result of ``jieba.analyse.textrank``
    and a 30-dash separator. Lines without an ASCII comma are skipped.
    """
    separator = '-' * 30
    for sentence in ljqpy.LoadList('training/merged_text.txt'):
        if ',' in sentence:
            print(sentence)
            # NOTE(review): allowPOS=None is passed through unchanged; confirm
            # the installed jieba version accepts None for this parameter.
            ranked = jieba.analyse.textrank(sentence, withWeight=True,
                                            allowPOS=None)
            for keyword, weight in ranked:
                print('%s %s' % (keyword, weight))
            print(separator)
예제 #5
0
def GetEdgeFromCoocc():
    """Derive co-occurrence edges between tags and write them to disk.

    Steps:

    1. Read ``training/all_data.txt`` (one JSON object per entry with ``id``
       and ``text``), compute each entry's tag dictionary with ``GetTags``
       and the document frequency of every tag.
    2. Keep as ``tags`` the terms that occur in more than two entries, have
       an IDF above 2, are longer than one character and are not all digits.
    3. For every entry, pair its tags with the tags of a window made of the
       entry itself and the two preceding entries, as long as those entries
       share the same id prefix (the part before the first ``'@'``). Pairs in
       which one tag is a substring of the other are ignored.
    4. Score each direction of a pair as ``pair_count / count_of_first_tag``
       and write the scores to ``gen_rels/edges_coocc.txt``.

    Side effects: sets the globals ``datalist``, ``datadict``, ``idf``,
    ``tags`` and ``r1cnt``; prints progress; writes the edge file.

    Fixes relative to the previous version:

    * The window used ``lasttts[-mi]`` with ``mi == 0``, which is
      ``lasttts[0]`` (the oldest buffered entry), and the intended
      "current entry" branch was unreachable. Offset 0 now means the current
      entry and offsets 1/2 the previous entries.
    * Ordering a pair swapped the outer loop variable ``w1`` in place, which
      corrupted the remaining inner iterations. The ordered pair is now held
      in separate names.
    """
    global datalist, datadict, idf, tags, r1cnt
    datalist = []
    datadict = {}
    df = defaultdict(int)
    for line in ljqpy.LoadList('training/all_data.txt'):
        jj = json.loads(line)
        datadict[jj['id']] = jj['text']
        tf = GetTags(jj['text'])
        for t in tf.keys():
            df[t] += 1
        jj['tf'] = tf
        datalist.append(jj)
    N = len(datalist)
    # Every s is at least 1, and the comprehension is empty when N == 0.
    idf = {x: math.log(N / s) for x, s in df.items()}
    tags = {
        x for x, s in df.items()
        if s > 2 and idf[x] > 2 and len(x) > 1 and not x.isdigit()
    }

    print('docu segs:', N)
    print('tags:', len(tags))

    # Rolling buffer of (id, tags) for the most recently processed entries.
    lasttts = []
    r2cnt = defaultdict(int)  # counts per ordered (smaller, larger) tag pair
    r1cnt = defaultdict(int)  # how often each tag took part in any pair

    for i, jj in enumerate(datalist):
        seg_id, words = jj['id'], jj['tf']
        tt = [x for x in words.keys() if x in tags]
        if i % 1000 == 0:
            print('datalist %d/%d' % (i, len(datalist)))

        doc_key = seg_id.split('@')[0]
        for mi in range(3):
            if mi == 0:
                # Offset 0 is the current entry itself.
                lid, lasttt = seg_id, tt
            elif mi <= len(lasttts):
                lid, lasttt = lasttts[-mi]
            else:
                # Not enough history yet (start of the data).
                break
            # Stop widening the window once a different id prefix is reached.
            if lid.split('@')[0] != doc_key:
                break

            for w1 in tt:
                for w2 in lasttt:
                    if w1 in w2 or w2 in w1:
                        continue
                    # Normalise the pair order without touching w1, which is
                    # still needed for the remaining w2 values.
                    first, second = (w1, w2) if w1 < w2 else (w2, w1)
                    r2cnt[(first, second)] += 1
                    r1cnt[first] += 1
                    r1cnt[second] += 1

        lasttts.append((seg_id, tt))
        # Keep the buffer short; at least six entries always remain, which
        # covers the two-entry look-back.
        if len(lasttts) > 10:
            lasttts = lasttts[5:]

    relscs = {}
    for g2, ng2 in ljqpy.FreqDict2List(r2cnt):
        for pos, w in enumerate(g2):
            # Directed score: share of w's pairings that involve the partner.
            relscs[(w, g2[1 - pos])] = ng2 / r1cnt[w]
        # NOTE(review): the first pair below the threshold is still scored
        # before stopping; this assumes FreqDict2List is sorted descending.
        if ng2 < 100:
            break

    with open('gen_rels/edges_coocc.txt', 'w', encoding='utf-8') as fout:
        for g2, rel in ljqpy.FreqDict2List(relscs):
            if rel < 0.2:
                break
            ljqpy.WriteLine(fout, ['coocc', g2[0], g2[1], rel])
예제 #6
0
	def __init__(self,min_limit,stopword=''):
		"""Set up the collector state and immediately collect articles.

		:param min_limit: stored as ``self.min_limit`` for later use.
		:param stopword: either a ready-made collection of stop words, or a
			path to a stop-word file. With a path, the built-in whitespace
			stop words are used and the file's entries (loaded through
			``ljqpy.LoadList``) are appended when the file exists; a message
			is printed when it does not. ``None`` selects the built-in
			whitespace stop words only.

		A list is stored as-is (not copied). Other iterables such as tuples
		or sets are converted to a list, so ``self.stopword`` is always
		defined before ``collect_arw_article`` runs.
		"""
		self.min_limit=min_limit
		builtin_stopword = [' ','\r\n','\r','\n','\t','\u3000']
		# isinstance also accepts list/str subclasses, unlike an exact type match.
		if isinstance(stopword, list):
			self.stopword = stopword
		elif isinstance(stopword, str):
			self.stopword = builtin_stopword
			if os.path.exists(stopword):
				self.stopword += ljqpy.LoadList(stopword)
			else:
				print('stopword path not exists')
		elif stopword is None:
			self.stopword = builtin_stopword
		else:
			# Previously any other type left self.stopword unset.
			self.stopword = list(stopword)

		self.fragment_count=0
		self.documents=[]
		self.label={}
		self.article=[]
		self.keyword_docs=[]
		self.collect_arw_article()
예제 #7
0
def GetEdgeFromCNDB():
    """Write knowledge-graph edges between graph nodes to ``gen_rels/edges_kg.txt``.

    The node names from ``saved_graph/graph_nodes.txt`` are processed in
    batches of 100. For each batch:

    * ``api.Ment2Ent`` maps node names (mentions) to candidate entities; the
      first candidate is kept unless it contains ``歌`` or ``影``.
    * ``api.GetEntTriplesMulti`` returns the triples of the kept entities.
      Every ``<a ...>text</a>`` link text in a triple object that is itself
      a node (and differs from the mention) produces a ``KG`` edge and the
      reverse ``KGi`` edge, weighted ``0.5 + 0.5 / number_of_candidates``.

    After all batches, mentions that resolved to the same entity are linked
    pairwise in both directions with ``KGm`` edges of weight 1.
    """
    nodes = ljqpy.LoadList('saved_graph/graph_nodes.txt')
    known_nodes = set(nodes)
    batch_size = 100
    entity_to_mentions = {}
    with open('gen_rels/edges_kg.txt', 'w', encoding='utf-8') as fout:
        for start in range(0, len(nodes), batch_size):
            print('%d/%d' % (start, len(nodes)))
            batch = nodes[start:start + batch_size]
            candidates = api.Ment2Ent(batch)

            # Resolve each mention to its first candidate entity.
            mention_to_entity = {}
            for mention in batch:
                found = candidates.get(mention, [])
                if len(found) == 0:
                    continue
                entity = found[0]
                if '歌' not in entity and '影' not in entity:
                    mention_to_entity[mention] = entity
                    entity_to_mentions.setdefault(entity, []).append(mention)

            triples = api.GetEntTriplesMulti(list(mention_to_entity.values()),
                                             keephref=True,
                                             nospecial=0)

            # Collect the whole batch first, then write it out in one go.
            edges = []
            for mention, entity in mention_to_entity.items():
                entity_triples = triples.get(entity, [])
                # Fewer candidate entities -> higher confidence in the link.
                weight = 0.5 + 0.5 / len(candidates.get(mention, []))
                for _predicate, obj in entity_triples:
                    for linked in re.findall('<a.+?>(.+?)</a>', obj):
                        if linked != mention and linked in known_nodes:
                            edges.append(('KG', mention, linked, weight))
                            edges.append(('KGi', linked, mention, weight))
            for edge in edges:
                ljqpy.WriteLine(fout, edge)

        # Mentions sharing an entity are connected to each other both ways.
        for mentions in entity_to_mentions.values():
            for position, later in enumerate(mentions):
                for earlier in mentions[:position]:
                    ljqpy.WriteLine(fout, ['KGm', later, earlier, 1])
                    ljqpy.WriteLine(fout, ['KGm', earlier, later, 1])
예제 #8
0
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
	'''
	Build the word or char lists for the input and output sequences.

	:param fn: corpus file loaded with ljqpy.LoadCSV; only its first two
		columns are counted.
	:param min_freq: minimum number of occurrences for a token to be kept.
	:param delimiter: separator used to split a column into tokens.
	:param dict_file: optional cache file holding both lists separated by a
		'<@@@>' entry; read when it exists, written otherwise.
	:return: (itokens, otokens) as TokenList objects.
	'''
	# An existing word/char list means nothing has to be rebuilt.
	have_cache = dict_file is not None and os.path.exists(dict_file)
	if have_cache:
		print('loading', dict_file)
		entries = ljqpy.LoadList(dict_file)
		boundary = entries.index('<@@@>')
		itokens = TokenList(entries[:boundary])
		otokens = TokenList(entries[boundary+1:])
		return itokens, otokens

	# Otherwise rebuild it from the corpus.
	in_counts, out_counts = {}, {}
	for record in ljqpy.LoadCSV(fn):
		for text, counts in zip(record, (in_counts, out_counts)):
			for token in text.split(delimiter):
				counts[token] = 1 + counts.get(token, 0)

	def frequent(counts):
		# Tokens whose count reaches min_freq, in FreqDict2List order.
		return [tok for tok, freq in ljqpy.FreqDict2List(counts) if freq >= min_freq]

	in_words = frequent(in_counts)
	out_words = frequent(out_counts)
	print('seq 1 words:', len(in_words))
	print('seq 2 words:', len(out_words))
	itokens = TokenList(in_words)
	otokens = TokenList(out_words)
	if dict_file is not None:
		ljqpy.SaveList(in_words+['<@@@>']+out_words, dict_file)
	return itokens, otokens
예제 #9
0
파일: 19.py 프로젝트: qifanyyy/CLCDSA
def Run():
    """Solve the next test case held in the global ``lst`` and return it as text.

    Consumes one case from ``lst``: a line with the grid size ``N`` followed
    by ``N`` rows of digits. The grid is packed into an integer bitmask
    (cell ``(x, y)`` at bit ``x * N + y``). Every bitmask that does not
    overlap the given cells is tried as a set of additions; among those
    for which ``Check`` accepts the combined mask, the smallest ``GetNum``
    value is returned, formatted with ``%d``. If none is accepted the result
    is ``N * N``.

    Rebinds the globals ``N`` and ``lst`` (the consumed lines are dropped).
    """
    global N, lst
    N = int(lst[0])
    mat, lst = lst[1:N + 1], lst[N + 1:]
    cell_count = N * N

    # Pack the given grid into one integer, row-major.
    given = 0
    for x in range(N):
        for y in range(N):
            given += int(mat[x][y]) << (x * N + y)

    best = cell_count
    for extra in range(1 << cell_count):
        if extra & given:
            continue  # may only add cells that are not set already
        cost = GetNum(extra)
        # Only call Check when this candidate could improve the answer.
        if cost < best and Check(given + extra):
            best = cost
    return '%d' % best


# Driver: read the input file, solve every case with Run() and write the
# answers in "Case #k: answer" format.
# `lst` is module-level state shared with Run(), which consumes it case by case.
lst = ljqpy.LoadList('D-small-attempt0.in')
outf = 'D-small-attempt0.out'

with open(outf, 'w') as fout:
    # The first entry is the number of cases; the rest is left for Run().
    N = int(lst[0])
    lst = lst[1:]
    # range(N) is evaluated once, so Run() rebinding the global N afterwards
    # does not change how many cases are processed.
    for k in range(N):
        fout.write('Case #%d: %s\n' % (1 + k, Run()))
        # Flush after every case so partial results are on disk during long runs.
        fout.flush()

# Launch emeditor.exe with the output file as its argument.
os.system('emeditor.exe ' + outf)
print('completed')
예제 #10
0
            z = q[qh]
            qh += 1
            for i in edge.get(z, []):
                if mm[i] != -1: continue
                mm[i] = mm[z] + 1
                q.append(i)
        return max(mm)

    #print(ls)
    rr = len(ls)
    for v in ls:
        rr += FindL(v)
    return max([rt, rr])


# Driver: each case occupies two input entries (a count, then a row of
# integers); the answers are written as "Case #k: answer".
lst = ljqpy.LoadList('C-large.in')
outf = 'C-large.out'

with open(outf, 'w') as fout:
    T = int(lst[0])
    ii = 1  # index of the next unread input entry
    for k in range(T):
        n = int(lst[ii])
        inp = tuple(int(token) for token in lst[ii + 1].split())
        ii += 2
        answer = Run(n, inp)
        fout.write('Case #%d: %d\n' % (1 + k, answer))

# Launch emeditor.exe with the output file as its argument.
os.system('emeditor.exe ' + outf)
print('completed')
예제 #11
0
def MakeMerged():
    """Extract the ``text`` field of every record into one merged text file.

    Each entry of ``training/all_data.txt`` is parsed as JSON; the ``text``
    values are saved, in input order, to ``training/merged_text.txt``.
    """
    records = ljqpy.LoadList('training/all_data.txt')
    merged = [json.loads(raw)['text'] for raw in records]
    ljqpy.SaveList(merged, 'training/merged_text.txt')
예제 #12
0
    random.seed(1333)
    st = set()
    while j > 0:
        v = [1] + [random.randint(0, 1) for x in range(n - 2)] + [1]
        zz = ''.join(str(x) for x in v)
        if zz in st: continue
        st.add(zz)
        rt = []
        for b in range(2, 11):
            z = 0
            for c in v:
                z = z * b + c
            rt.append(CheckNotPrime(z))
            if rt[-1] == 0: break
        if rt[-1] != 0:
            print(zz, rt)
            fout.write(zz + ' ')
            fout.write(' '.join(str(x) for x in rt))
            fout.write('\n')
            j -= 1


# Driver: write the single-case header, then let Run() produce the body.
# NOTE(review): `lst` is loaded but not used anywhere in the code visible here;
# the load still requires 'input.txt' to be readable.
lst = ljqpy.LoadList('input.txt')
outf = 'C-large.out'

with open(outf, 'w') as fout:
    fout.write('Case #1:\n')
    # Run receives the open file handle; presumably it writes the result
    # lines itself (its definition is only partly visible) — TODO confirm.
    Run(32, 500, fout)

# Passes the bare output file name to the shell as the command.
os.system(outf)
print('completed')
예제 #13
0
    for i in range(1, k + 1):
        for j in range(0, i + 1):
            if j > 0:
                f[i][j] += f[i - 1][j - 1] * pps[i - 1]
            f[i][j] += f[i - 1][j] * (1 - pps[i - 1])
    return f[k][k // 2]


def Run(p1, p2):
    """Return the best ``Compute`` value over all size-``k`` subsets, as text.

    :param p1: line holding two integers ``n`` and ``k``.
    :param p2: line holding at least ``n`` floating-point values.
    :return: the maximum of ``Compute(subset)`` over every ``k``-element
        subset of the first ``n`` values (elements kept in input order),
        formatted with ``'%.8f'``. The result is ``0.00000000`` when no
        subset exists or no value exceeds zero.
    :raises IndexError: if ``p2`` holds fewer than ``n`` values.

    The subsets are generated directly with ``itertools.combinations``
    instead of scanning all ``2**n`` bitmasks and discarding those of the
    wrong size; the set of evaluated subsets, and therefore the maximum, is
    unchanged.
    """
    from itertools import combinations

    n, k = map(int, p1.split())
    plst = list(map(float, p2.split()))
    # Index explicitly so that a too-short value line still fails loudly.
    candidates = [plst[u] for u in range(n)]

    ret = 0.0
    # combinations() rejects a negative size; the bitmask scan simply found
    # no matching subset in that case, so keep returning 0.0.
    if k >= 0:
        for subset in combinations(candidates, k):
            ret = max(ret, Compute(list(subset)))
    return '%.8f' % ret


# Driver: every case is described by two consecutive input entries that are
# handed to Run() unparsed; answers are written as "Case #k: answer".
lst = ljqpy.LoadList('B-small-attempt2.in')
outf = 'B-small-attempt2.out'

with open(outf, 'w') as fout:
    N = int(lst[0])
    for k in range(N):
        # Entry 0 is the case count, so case k uses entries 2k+1 and 2k+2.
        result = Run(lst[k * 2 + 1], lst[k * 2 + 2])
        fout.write('Case #%d: %s\n' % (1 + k, result))
        # Flush per case so finished answers survive an interrupted run.
        fout.flush()

# Launch emeditor.exe with the output file as its argument.
os.system('emeditor.exe ' + outf)
print('completed')