Пример #1
0
def get_blog(directory, train_ratio=0.7):
    """Load blog files and split each blog's posts into train and test parts.

    Every name in ``directory`` is opened under ``<cwd>/blogs`` (decoded as
    UTF-8, undecodable bytes ignored), its text is split on whitespace and
    handed to ``getData.extract_post``.  For each blog, the first
    ``floor(len(posts) * train_ratio)`` posts are passed to
    ``getData.concat`` to build the train entry and the remaining posts are
    passed to ``getData.concat`` to build the test entry.

    Progress is reported on stdout as ``blog <n>`` after each file.

    Args:
        directory: Sequence of file names located in ``<cwd>/blogs``.  The
            text before the first ``.`` of each name must parse as an int;
            it becomes that blog's entry in the returned index.
        train_ratio: Fraction of each blog's posts that goes to the train
            part; must lie in ``[0, 1]``.  Defaults to 0.7, the split that
            used to be hard-coded.

    Returns:
        A tuple ``(train, test, index)`` of three lists aligned with
        ``directory``: the ``getData.concat`` result for the train slice,
        the ``getData.concat`` result for the test slice, and the integer
        parsed from the file name.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]`` or a file name
            does not start with an integer.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            "train_ratio must be between 0 and 1, got %r" % (train_ratio,)
        )

    blogs_dir = os.path.join(os.getcwd(), 'blogs')
    corpus = []
    index = []

    for count, name in enumerate(directory, start=1):
        blog_path = os.path.join(blogs_dir, name)
        with codecs.open(blog_path, "r", encoding='utf-8', errors='ignore') as file:
            words = file.read().split()
        # The file is fully read at this point, so the handle is released
        # before the (potentially slow) post extraction runs.
        posts = getData.extract_post(words)
        index.append(int(name.split(".")[0]))
        corpus.append(posts)
        print("blog " + str(count))

    train = []
    test = []

    # Per-blog split: leading posts go to train, trailing posts to test.
    for posts in corpus:
        size = math.floor(len(posts) * train_ratio)
        train.append(getData.concat(posts[:size]))
        test.append(getData.concat(posts[size:]))

    return train, test, index
Пример #2
0
def write_author_names():
    """Write a CSV listing every blog file and its number of posts.

    Creates (or overwrites) ``author_post.csv`` in the current working
    directory with the header ``Author,Npost`` followed by one row per file
    found directly inside ``<cwd>/blogs``: the file name and the length of
    what ``getData.extract_post`` returns for that file's whitespace-split
    text.  The zero-based position of each processed file is printed to
    stdout as a progress indicator.

    If ``<cwd>/blogs`` does not exist, no files are found and only the
    header row is written.
    """
    blogs_dir = os.path.join(os.getcwd(), 'blogs')
    # Only the top-level file list is needed, so take the first entry that
    # os.walk yields instead of traversing the whole tree.
    file_names = next(os.walk(blogs_dir), (blogs_dir, [], []))[2]

    # newline='' lets the csv module control line endings itself; without it
    # platforms that translate newlines end up with blank lines between rows.
    with open("author_post.csv", "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["Author", "Npost"])

        for position, name in enumerate(file_names):
            blog_path = os.path.join(blogs_dir, name)
            with codecs.open(blog_path, "r", encoding='utf-8', errors='ignore') as file:
                words = file.read().split()
            posts = getData.extract_post(words)

            writer.writerow([name, len(posts)])

            print(position)