예제 #1
0
print("Writing data...")
if __name__ == "__main__":
    # Script entry point: stream (label, paragraphs) records, cut each record's
    # paragraphs into text chunks, and write every sufficiently long chunk as
    # one JSON line.  Output is spread over numbered files
    # (<stem>0.json, <stem>1.json, ...), starting a new file after every
    # MAX_PER_FILE written datapoints.
    import os  # local import: only this script body needs it

    # Strip only the final extension.  Splitting on the first '.' would
    # truncate paths such as './out/data.json' to an empty stem.
    stem = os.path.splitext(DESTFILE)[0]

    data = map(preprocess, data_from_many(datafiles))
    written = 0      # datapoints written so far, across all output files
    file_index = 0   # numeric suffix of the output file currently open
    file = open(stem + str(file_index) + '.json', 'w')
    try:
        for label, paragraphs in data:
            # Records carrying either sentinel value are skipped entirely.
            if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
                continue
            # NOTE(review): bunch_paragraphs presumably merges paragraphs into
            # chunks of roughly target_length words - confirm against its
            # definition.
            for chunk in bunch_paragraphs(paragraphs, target_length=250):
                # Drop chunks with fewer than 100 space-separated tokens.
                if len(chunk.split(' ')) < 100:
                    continue
                written += 1
                print("%i datapoints written... %s" % (written, label) + 50*' ')
                file.write(json.dumps((label, chunk)) + '\n')
                if written % MAX_PER_FILE == 0:
                    # Current file is full: rotate to the next numbered file.
                    file_index += 1
                    file.close()
                    file = open(stem + str(file_index) + '.json', 'w')
    finally:
        # Close the active output file even if processing fails midway.
        file.close()
    print()
    print("%s processes. %i lines written to %s." % (DATAPATH, written, DESTFILE))
예제 #2
0
    # Stream (label, paragraphs) records, filter and cut each record's
    # paragraphs into text chunks, and write every sufficiently long chunk as
    # one JSON line.  Output is spread over numbered files
    # (<stem>0.json, <stem>1.json, ...), starting a new file after every
    # MAX_PER_FILE written datapoints.
    import os  # local import: only this script body needs it

    # Strip only the final extension.  Splitting on the first '.' would
    # truncate paths such as './out/data.json' to an empty stem.
    stem = os.path.splitext(DESTFILE)[0]

    data = map(preprocess, data_from_many(datafiles))
    written = 0      # datapoints written so far, across all output files
    file_index = 0   # numeric suffix of the output file currently open
    file = open(stem + str(file_index) + '.json', 'w')
    try:
        for label, paragraphs in data:
            # Records carrying either sentinel value are skipped entirely.
            if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
                continue
            try:
                # NOTE(review): filter_paragraphs(FN, ...) and bunch_paragraphs
                # are defined elsewhere; presumably they filter the paragraphs
                # and merge them into ~target_length-word chunks - confirm.
                chunks = bunch_paragraphs(filter_paragraphs(FN, paragraphs),
                                          target_length=250)
                for chunk in chunks:
                    # Drop chunks with fewer than 100 space-separated tokens.
                    if len(chunk.split(' ')) < 100:
                        continue
                    written += 1
                    print("%i datapoints written... %s\t%s" % (written, label, FN) + 50*' ')
                    file.write(json.dumps((label, chunk)) + '\n')
                    if written % MAX_PER_FILE == 0:
                        # Current file is full: rotate to the next numbered file.
                        file_index += 1
                        file.close()
                        file = open(stem + str(file_index) + '.json', 'w')
            except Exception as err:
                # Best-effort processing: a failing record is skipped, but the
                # failure is reported instead of being silently discarded.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("Skipping record %r: %r" % (label, err))
    finally:
        # Close the active output file even if processing is interrupted.
        file.close()