def get_blog(directory, train_fraction=0.7):
    """Load blog files and split each blog's posts into train and test parts.

    Every name in *directory* is read from ``<cwd>/blogs/<name>`` as UTF-8
    (undecodable bytes are ignored), split on whitespace and passed to
    ``getData.extract_post``.  The first ``floor(len(posts) * train_fraction)``
    posts of each blog form its training part and the rest its test part;
    each part is combined with ``getData.concat``.

    Args:
        directory: Sequence of file names located in the ``blogs`` folder of
            the current working directory.  The text before the first ``.``
            of each name must parse as an integer (e.g. ``"123.xml"``).
        train_fraction: Share of each blog's posts assigned to the training
            part.  Defaults to 0.7 (a 70/30 split).

    Returns:
        A ``(train, test, index)`` tuple of three lists aligned by position:
        the concatenated training posts, the concatenated test posts and the
        integer parsed from each file name.

    Raises:
        ValueError: If *train_fraction* is outside ``[0, 1]`` or a file name
            does not start with an integer.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    blogs_dir = os.path.join(os.getcwd(), 'blogs')
    corpus = []
    index = []
    for amount, name in enumerate(directory, start=1):
        blog_path = os.path.join(blogs_dir, name)
        with codecs.open(blog_path, "r", encoding='utf-8',
                         errors='ignore') as file:
            blog = file.read().split()
        # The file is already closed here; only the token list is needed.
        posts = getData.extract_post(blog)
        index.append(int(name.split(".")[0]))
        corpus.append(posts)
        print("blog " + str(amount))  # progress output, one line per blog

    train = []
    test = []
    for posts in corpus:
        # Chronology/order of posts is whatever extract_post returned; the
        # leading slice goes to train and the trailing slice to test.
        size = math.floor(len(posts) * train_fraction)
        train.append(getData.concat(posts[:size]))
        test.append(getData.concat(posts[size:]))
    return train, test, index
def write_author_names():
    """Write ``author_post.csv`` listing each blog file and its post count.

    The files directly inside ``<cwd>/blogs`` are read as UTF-8 (undecodable
    bytes are ignored), split on whitespace and passed to
    ``getData.extract_post``.  One CSV row ``[file name, number of posts]``
    is written per file, after an ``Author,Npost`` header row.  The CSV is
    created in the current working directory, and the zero-based position of
    each processed file is printed as progress output.

    If the ``blogs`` directory does not exist it is treated as containing no
    files, so only the header row is written.
    """
    blogs_dir = os.path.join(os.getcwd(), 'blogs')
    # The first tuple yielded by os.walk describes blogs_dir itself; its
    # third item is the list of file names directly inside it.  os.walk
    # yields nothing for a missing directory, hence the empty default.
    file_names = next(os.walk(blogs_dir), (blogs_dir, [], []))[2]

    # newline='' is required by the csv module so that the writer's own
    # line terminator is not translated again (blank rows on Windows).
    with open("author_post.csv", "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["Author", "Npost"])
        for i, name in enumerate(file_names):
            blog_path = os.path.join(blogs_dir, name)
            with codecs.open(blog_path, "r", encoding='utf-8',
                             errors='ignore') as file:
                blog = file.read().split()
            posts = getData.extract_post(blog)
            writer.writerow([name, len(posts)])
            print(i)