def trans(src, dst, index, label_index, mode='w', sep='/'):
    """Convert a tagged text file into binary graph records.

    Each line of ``src`` (read as UTF-8) is split on single spaces into
    tokens, and every token is split at the last occurrence of ``sep``
    into a word part and a tag part. The word parts are concatenated into
    one sequence; for every position of that sequence the keys yielded by
    ``gen_keys`` are mapped to ids through the feature indexer, and ids
    below zero are discarded. One linear-chain graph per line is written
    to ``dst`` with ``json_to_binary.graph_to_file``.

    Args:
        src: Path of the input text file.
        dst: Path of the binary output file (opened with ``'wb'``).
        index: First argument given to ``indexer.Indexer`` for features.
        label_index: First argument given to ``indexer.Indexer`` for labels.
        mode: Second argument given to both ``indexer.Indexer`` calls.
        sep: Separator between word and tag inside a token. When it is a
            single space, tokens carry no tag and every tag is ``''``.

    Progress is printed every 1000 input lines; the size of the feature
    indexer and ``'the end'`` are printed when the conversion finishes.
    """
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)

    # Context managers guarantee that both files are closed even when a
    # line fails to convert part-way through.
    with open(dst, 'wb') as file, open(src, encoding='utf8') as source:
        for ln, raw in enumerate(source, 1):
            parts = [tok.rpartition(sep) for tok in raw.strip().split(' ')]
            if sep == ' ':
                # Splitting on ' ' already removed the separator, so the
                # whole token is the last element of the partition.
                tags = ['' for _ in parts]
                words = [p[-1] for p in parts]
            else:
                tags = [p[-1] for p in parts]
                words = [p[0] for p in parts]
            seq = ''.join(words)

            # Build real lists: a lazy ``filter`` object can be consumed
            # only once and has no length.
            features = []
            for pos in range(len(seq)):
                ids = [inder(key) for key in gen_keys(seq, pos)]
                features.append([fid for fid in ids if fid >= 0])

            # Node layout: [boundary flag, predecessor indices, tag, features].
            graph = [[0, [], tag, feats]
                     for tag, feats in zip(_to_tags(tags, words, lid), features)]
            if not graph:
                continue
            graph[0][0] += 1   # first node gets +1
            graph[-1][0] += 2  # last node gets +2 (a lone node ends up at 3)
            for pos in range(1, len(graph)):
                graph[pos][1] = [pos - 1]  # each node follows the previous one

            json_to_binary.graph_to_file(graph, file)
            if ln % 1000 == 0:
                print(ln)

    print(len(inder))
    print('the end')
def trans(src, dst, index, label_index, mode='w', sep='/'):
    """Translate a word/tag corpus file into a binary graph file.

    Every line of ``src`` (UTF-8) is split on single spaces; each token is
    split with ``str.rpartition(sep)`` into a word and a tag. The words are
    joined into one sequence, ``gen_keys(seq, i)`` is evaluated for every
    position ``i`` and the keys are turned into ids by the feature indexer,
    keeping only ids that are ``>= 0``. The resulting chain of nodes is
    serialised to ``dst`` through ``json_to_binary.graph_to_file``. Lines
    that produce no nodes are skipped.

    Args:
        src: Input text file path.
        dst: Output binary file path.
        index: Passed to ``indexer.Indexer`` to build the feature indexer.
        label_index: Passed to ``indexer.Indexer`` to build the label indexer.
        mode: Mode argument forwarded to both indexers.
        sep: Word/tag separator; with ``' '`` the input is treated as
            untagged and all tags are empty strings.

    Prints the line count every 1000 lines, then ``len`` of the feature
    indexer and ``'the end'``.
    """
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    untagged = sep == ' '

    # ``with`` closes the output even on error and, unlike a bare
    # ``open(src)`` in the loop header, also closes the input.
    with open(dst, 'wb') as file:
        with open(src, encoding='utf8') as reader:
            ln = 0
            for text in reader:
                ln += 1
                wts = [token.rpartition(sep) for token in text.strip().split(' ')]
                if untagged:
                    tags = [''] * len(wts)
                    chunks = [wt[-1] for wt in wts]
                else:
                    tags = [wt[-1] for wt in wts]
                    chunks = [wt[0] for wt in wts]
                seq = ''.join(chunks)

                # Materialise the kept ids as lists instead of storing
                # one-shot ``filter`` iterators inside the graph.
                fs = [
                    [fid for fid in [inder(k) for k in gen_keys(seq, i)] if fid >= 0]
                    for i in range(len(seq))
                ]

                # Each node is [flag, predecessors, tag, feature ids].
                graph = []
                for label, vec in zip(_to_tags(tags, chunks, lid), fs):
                    graph.append([0, [], label, vec])
                if not graph:
                    continue

                graph[0][0] += 1    # mark the first node
                graph[-1][0] += 2   # mark the last node
                for i in range(1, len(graph)):
                    graph[i][1] = [i - 1]   # link to the preceding node

                json_to_binary.graph_to_file(graph, file)
                if ln % 1000 == 0:
                    print(ln)

    print(len(inder))
    print('the end')
def trans(src, dst, index, label_index, mode="w", sep="/", dictionary=None):
    """Convert a tagged text file into binary graph records.

    Each line of ``src`` (read as UTF-8) is split on single spaces into
    tokens, and every token is split at the last occurrence of ``sep``
    into a word part and a tag part. The word parts are concatenated into
    one sequence; for every position the keys from ``gen_keys`` are mapped
    to ids by the feature indexer. When ``dictionary`` is given, a
    ``DictFeature`` built from it is called with the sequence, the feature
    indexer and the per-position id lists before ids below zero are
    removed. One linear-chain graph per line is written to ``dst``.

    Args:
        src: Path of the input text file.
        dst: Path of the binary output file (opened with ``"wb"``).
        index: First argument given to ``indexer.Indexer`` for features.
        label_index: First argument given to ``indexer.Indexer`` for labels.
        mode: Second argument given to both ``indexer.Indexer`` calls.
        sep: Separator between word and tag inside a token. When it is a
            single space, tokens carry no tag and every tag is ``""``.
        dictionary: Optional argument for ``DictFeature``; falsy values
            disable the dictionary step.

    Progress is printed every 1000 input lines; the size of the feature
    indexer and ``"the end"`` are printed when the conversion finishes.
    """
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    dict_feature = DictFeature(dictionary) if dictionary else None

    # Context managers guarantee that both files are closed even when a
    # line fails to convert part-way through.
    with open(dst, "wb") as file, open(src, encoding="utf8") as source:
        for ln, raw in enumerate(source, 1):
            parts = [tok.rpartition(sep) for tok in raw.strip().split(" ")]
            if sep == " ":
                # Splitting on " " already removed the separator, so the
                # whole token is the last element of the partition.
                tags = ["" for _ in parts]
                words = [p[-1] for p in parts]
            else:
                tags = [p[-1] for p in parts]
                words = [p[0] for p in parts]
            seq = "".join(words)

            features = [[inder(key) for key in gen_keys(seq, pos)]
                        for pos in range(len(seq))]
            if dict_feature is not None:
                # Called for its effect on ``features``; the return value
                # is not used.
                dict_feature(seq, inder, features)
            # Drop negative ids only after the dictionary step has run.
            features = [[fid for fid in vec if fid >= 0] for vec in features]

            # Node layout: [boundary flag, predecessor indices, tag, features].
            graph = [[0, [], tag, vec]
                     for tag, vec in zip(_to_tags(tags, words, lid), features)]
            if not graph:
                continue
            graph[0][0] += 1   # first node gets +1
            graph[-1][0] += 2  # last node gets +2 (a lone node ends up at 3)
            for pos in range(1, len(graph)):
                graph[pos][1] = [pos - 1]  # each node follows the previous one

            json_to_binary.graph_to_file(graph, file)
            if ln % 1000 == 0:
                print(ln)

    print(len(inder))
    print("the end")
def test(index, src, dst):
    """Convert a whitespace-separated text file into binary graph records.

    The feature indexer is created with mode ``'r'``. Each line of ``src``
    (UTF-8) is split on whitespace and the pieces are joined into one
    sequence. For every position the keys from ``gen_keys`` are mapped to
    ids by the indexer, keeping only ids that are ``>= 0``. The resulting
    chain of nodes is written to ``dst`` with
    ``json_to_binary.graph_to_file``; lines yielding no nodes are skipped.

    Args:
        index: First argument given to ``indexer.Indexer``.
        src: Path of the input text file.
        dst: Path of the binary output file (opened with ``'wb'``).

    Prints ``'the end'`` when done.
    """
    inder = indexer.Indexer(index, 'r')

    # ``with`` closes both files even if a line fails to convert.
    with open(dst, 'wb') as file, open(src, encoding='utf8') as source:
        for raw in source:
            words = raw.split()
            seq = ''.join(words)

            # Keep real lists rather than one-shot ``filter`` iterators.
            features = []
            for pos in range(len(seq)):
                ids = [inder(key) for key in gen_keys(seq, pos)]
                features.append([fid for fid in ids if fid >= 0])

            # NOTE(review): the ``trans`` variants in this file call
            # ``_to_tags(tags, line, lid)``; confirm that ``_to_tags``
            # also accepts this single-argument form.
            graph = [[0, [], tag, vec]
                     for tag, vec in zip(_to_tags(words), features)]
            if not graph:
                continue

            # Node layout: [boundary flag, predecessor indices, tag, features].
            graph[0][0] += 1   # first node gets +1
            graph[-1][0] += 2  # last node gets +2 (a lone node ends up at 3)
            for pos in range(1, len(graph)):
                graph[pos][1] = [pos - 1]  # each node follows the previous one

            json_to_binary.graph_to_file(graph, file)

    print('the end')
def trans(src, dst, index, label_index, mode='w', sep='/', dictionary=None):
    """Translate a word/tag corpus file into a binary graph file.

    Every line of ``src`` (UTF-8) is split on single spaces; each token is
    split with ``str.rpartition(sep)`` into a word and a tag. The words are
    joined into one sequence and ``gen_keys(seq, i)`` is turned into ids by
    the feature indexer for every position ``i``. If ``dictionary`` is
    truthy, a ``DictFeature`` created from it is invoked with the sequence,
    the feature indexer and the id lists. Afterwards only ids ``>= 0`` are
    kept, and the chain of nodes is serialised to ``dst`` through
    ``json_to_binary.graph_to_file``. Lines producing no nodes are skipped.

    Args:
        src: Input text file path.
        dst: Output binary file path.
        index: Passed to ``indexer.Indexer`` to build the feature indexer.
        label_index: Passed to ``indexer.Indexer`` to build the label indexer.
        mode: Mode argument forwarded to both indexers.
        sep: Word/tag separator; with ``' '`` the input is treated as
            untagged and all tags are empty strings.
        dictionary: Optional argument for ``DictFeature``; when falsy the
            dictionary step is skipped.

    Prints the line count every 1000 lines, then ``len`` of the feature
    indexer and ``'the end'``.
    """
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    dict_feature = DictFeature(dictionary) if dictionary else None
    untagged = sep == ' '

    # ``with`` closes the output even on error and, unlike a bare
    # ``open(src)`` in the loop header, also closes the input.
    with open(dst, 'wb') as file:
        with open(src, encoding='utf8') as reader:
            ln = 0
            for text in reader:
                ln += 1
                wts = [token.rpartition(sep) for token in text.strip().split(' ')]
                if untagged:
                    tags = [''] * len(wts)
                    chunks = [wt[-1] for wt in wts]
                else:
                    tags = [wt[-1] for wt in wts]
                    chunks = [wt[0] for wt in wts]
                seq = ''.join(chunks)

                fs = [[inder(k) for k in gen_keys(seq, i)] for i in range(len(seq))]
                if dict_feature is not None:
                    # Invoked for its effect on ``fs``; result is ignored.
                    dict_feature(seq, inder, fs)
                # Negative ids are removed only after the dictionary step.
                fs = [[fid for fid in fv if fid >= 0] for fv in fs]

                # Each node is [flag, predecessors, tag, feature ids].
                graph = []
                for label, vec in zip(_to_tags(tags, chunks, lid), fs):
                    graph.append([0, [], label, vec])
                if not graph:
                    continue

                graph[0][0] += 1    # mark the first node
                graph[-1][0] += 2   # mark the last node
                for i in range(1, len(graph)):
                    graph[i][1] = [i - 1]   # link to the preceding node

                json_to_binary.graph_to_file(graph, file)
                if ln % 1000 == 0:
                    print(ln)

    print(len(inder))
    print('the end')