# Export script: streams every (label, paragraphs) record produced by
# ``preprocess`` over ``data_from_many(datafiles)``, bunches the paragraphs
# into text chunks (``bunch_paragraphs(..., target_length=250)``) and writes
# each chunk that splits into at least 100 pieces on single spaces as one
# JSON line ``[label, chunk]``.  Output is sharded: a new file is started
# after every MAX_PER_FILE datapoints.
print("Writing data...")
if __name__ == "__main__":
    # `map` is lazy, so records are preprocessed one at a time while the
    # loop below consumes them.
    data = map(preprocess, data_from_many(datafiles))

    i = 0  # datapoints written so far, across all shards
    j = 0  # index of the shard currently being written
    k = 0  # records consumed from `data`, including skipped ones

    # Shards are named <stem><j>.json, where <stem> is everything before the
    # first '.' in DESTFILE.
    # NOTE(review): a DESTFILE with an earlier dot (e.g. "./out/x.json")
    # yields an empty or truncated stem -- confirm DESTFILE is a bare name.
    stem = DESTFILE.split('.')[0]
    filename = stem + str(j) + '.json'
    file = open(filename, 'w')
    try:
        for label, paragraphs in data:
            k += 1
            # Records carrying the 'NOLABEL' / 'NOCONTENT' sentinels are skipped.
            if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
                continue

            for chunk in bunch_paragraphs(paragraphs, target_length=250):
                # Drop chunks with fewer than 100 space-separated pieces.
                if len(chunk.split(' ')) < 100:
                    continue

                i += 1
                print("%i datapoints written... %s" % (i, label) + 50 * ' ')
                # A (label, chunk) tuple serialises as the JSON array
                # [label, chunk], one datapoint per line.
                file.write(json.dumps((label, chunk)) + '\n')

                # Rotate to the next shard once the current one is full.
                # NOTE: this runs right after the write, so an empty trailing
                # shard is created when the total is an exact multiple of
                # MAX_PER_FILE.
                if i % MAX_PER_FILE == 0:
                    j += 1
                    filename = stem + str(j) + '.json'
                    file.close()
                    file = open(filename, 'w')
    finally:
        # Close the current shard even if preprocessing or writing raised;
        # closing an already-closed file is a no-op.
        file.close()

    print()
    print("%s processes. %i lines written to %s." % (DATAPATH, i, DESTFILE))
# Export loop with per-record filtering: streams every (label, paragraphs)
# record produced by ``preprocess`` over ``data_from_many(datafiles)``, passes
# the paragraphs through ``filter_paragraphs(FN, ...)``, bunches the result
# into text chunks (``target_length=250``) and writes each chunk that splits
# into at least 100 pieces on single spaces as one JSON line
# ``[label, chunk]``.  Output is sharded: a new file is started after every
# MAX_PER_FILE datapoints.  A record that fails is reported on stderr and
# skipped instead of aborting the run.
import sys  # only needed for the stderr diagnostics below

# `map` is lazy, so records are preprocessed one at a time while the loop
# below consumes them.
data = map(preprocess, data_from_many(datafiles))

i = 0  # datapoints written so far, across all shards
j = 0  # index of the shard currently being written
k = 0  # records consumed from `data`, including skipped ones

# Shards are named <stem><j>.json, where <stem> is everything before the
# first '.' in DESTFILE.
# NOTE(review): a DESTFILE with an earlier dot (e.g. "./out/x.json") yields
# an empty or truncated stem -- confirm DESTFILE is a bare name.
stem = DESTFILE.split('.')[0]
filename = stem + str(j) + '.json'
file = open(filename, 'w')
try:
    for label, paragraphs in data:
        k += 1
        # Records carrying the 'NOLABEL' / 'NOCONTENT' sentinels are skipped.
        if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
            continue

        try:
            # FN is defined outside this section -- presumably the name of
            # the source file currently being read; verify against its
            # producer.
            text_chunks = bunch_paragraphs(
                filter_paragraphs(FN, paragraphs), target_length=250)

            for chunk in text_chunks:
                # Drop chunks with fewer than 100 space-separated pieces.
                if len(chunk.split(' ')) < 100:
                    continue

                i += 1
                print("%i datapoints written... %s\t%s" % (i, label, FN)
                      + 50 * ' ')
                # A (label, chunk) tuple serialises as the JSON array
                # [label, chunk], one datapoint per line.
                file.write(json.dumps((label, chunk)) + '\n')

                # Rotate to the next shard once the current one is full.
                # The new file is opened BEFORE the old one is released, so
                # a failed open leaves `file` pointing at a usable handle
                # rather than a closed one.
                # NOTE: this runs right after the write, so an empty trailing
                # shard is created when the total is an exact multiple of
                # MAX_PER_FILE.
                if i % MAX_PER_FILE == 0:
                    next_name = stem + str(j + 1) + '.json'
                    next_file = open(next_name, 'w')
                    previous, file = file, next_file
                    j += 1
                    filename = next_name
                    previous.close()
        except Exception as exc:
            # Best-effort: one bad record must not stop the export, but the
            # failure is reported rather than silently discarded.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the script.
            print("skipping remainder of record %i (label %r): %r"
                  % (k, label, exc), file=sys.stderr)
finally:
    # Close the current shard even if iteration itself raised; closing an
    # already-closed file is a no-op.
    file.close()