def parseSingleFile(self, fileName):
    """Rewrite a raw ``.txt`` comment dump as a ``{"data":[...]}`` JSON file.

    Each input line is split once on the ``","`` separator. The first piece
    is treated as the comment body: the ``{"body":"`` prefix, every backslash
    and every double quote are removed, and runs of whitespace are collapsed
    to single spaces. The rest of the line is copied through unchanged,
    including its trailing newline, so consecutive records are separated by
    a line break followed by a comma.

    Skipped lines:
      * lines whose text before the first ``:`` is exactly ``submission``;
      * lines that do not contain the ``","`` separator.

    Args:
        fileName: Path of the input file. Names that do not contain ``.txt``
            are ignored and nothing is opened or written. The output path is
            ``fileName`` with every ``.txt`` removed plus ``.json``.

    Returns:
        None. The result is written to the output file.
    """
    # Validate the name before touching the filesystem so that a rejected
    # name never leaves an open handle behind.
    if ".txt" not in fileName:
        return

    outputName = fileName.replace(".txt", "") + '.json'

    # Both files are managed by ``with`` so they are closed even if a line
    # fails to parse halfway through.
    with open(fileName, 'r') as source:
        with open(outputName, 'w') as target:
            target.write('{"data":[')
            separator = ""
            for line in source:
                if line.split(":")[0] == "submission":
                    # Header line carrying submission info, not a comment.
                    continue

                info = line.split('","', 1)
                if len(info) < 2:
                    # No body/remainder separator. Skip the line *before*
                    # writing anything so that the previous record is not
                    # emitted a second time.
                    continue

                body = info[0].replace('{"body":"', "")
                body = body.replace("\\", "")

                # NOTE(review): the body is round-tripped through Comment,
                # presumably a plain holder -- confirm setBody() applies no
                # normalisation before removing this indirection.
                comment = Comment()
                comment.setBody(body.replace('"', ""))

                record = ('{"body":"' + ' '.join(comment.getBody().split())
                          + '","' + info[1])

                # Emit the comma in front of every record but the first; this
                # yields the same layout as buffering the previous record.
                target.write(separator + record)
                separator = ","
            target.write("]}")
def parseSingleFile(self, fileName):
    """Rewrite a raw ``.txt`` comment dump as a ``{"data":[...]}`` JSON file.

    Each input line is split once on the ``","`` separator. The first piece
    is treated as the comment body: the ``{"body":"`` prefix, every backslash
    and every double quote are removed, and runs of whitespace are collapsed
    to single spaces. The rest of the line is copied through unchanged,
    including its trailing newline, so consecutive records are separated by
    a line break followed by a comma.

    Skipped lines:
      * lines whose text before the first ``:`` is exactly ``submission``;
      * lines that do not contain the ``","`` separator.

    Args:
        fileName: Path of the input file. Names that do not contain ``.txt``
            are ignored and nothing is opened or written. The output path is
            ``fileName`` with every ``.txt`` removed plus ``.json``.

    Returns:
        None. The result is written to the output file.
    """
    # Validate the name before touching the filesystem so that a rejected
    # name never leaves an open handle behind.
    if ".txt" not in fileName:
        return

    outputName = fileName.replace(".txt", "") + '.json'

    # Both files are managed by ``with`` so they are closed even if a line
    # fails to parse halfway through.
    with open(fileName, 'r') as source:
        with open(outputName, 'w') as target:
            target.write('{"data":[')
            separator = ""
            for line in source:
                if line.split(":")[0] == "submission":
                    # Header line carrying submission info, not a comment.
                    continue

                info = line.split('","', 1)
                if len(info) < 2:
                    # No body/remainder separator. Skip the line *before*
                    # writing anything so that the previous record is not
                    # emitted a second time.
                    continue

                body = info[0].replace('{"body":"', "")
                body = body.replace("\\", "")

                # NOTE(review): the body is round-tripped through Comment,
                # presumably a plain holder -- confirm setBody() applies no
                # normalisation before removing this indirection.
                comment = Comment()
                comment.setBody(body.replace('"', ""))

                record = ('{"body":"' + ' '.join(comment.getBody().split())
                          + '","' + info[1])

                # Emit the comma in front of every record but the first; this
                # yields the same layout as buffering the previous record.
                target.write(separator + record)
                separator = ","
            target.write("]}")
def parseJSONFiles(self, fileNames):
    """Load comment JSON files into one topic and append it to ``self.topics``.

    The first file name decides both the topic kind and the topic name:
    the kind is ``"controvertial"`` when the name contains ``controversial``
    and ``"top"`` otherwise; the name is the text between ``top_`` and the
    following underscore.

    Every file must hold an object with a ``data`` list. Entries whose body
    contains ``[deleted]`` or whose author contains ``None`` are skipped;
    all others are copied field by field into a ``Comment`` and added to the
    topic. A file with an empty ``data`` list is reported and skipped.

    Args:
        fileNames: Sequence of JSON file paths. An empty sequence makes the
            call a no-op.

    Returns:
        None.
    """
    if len(fileNames) == 0:
        return

    leadName = fileNames[0]
    topicFactory = TopicFactory()

    # Both topic kinds derive the name the same way, so only the factory key
    # depends on the file name.
    isControversial = "controversial" in leadName
    topicName = leadName.split("top_")[1].split("_")[0]
    topic = topicFactory.factory("controvertial" if isControversial else "top")

    topic.clearAuthors()
    topic.setName(topicName)

    for path in fileNames:
        with open(path) as handle:
            entries = json.load(handle)['data']
            if len(entries) == 0:
                print("File: " + str(path) + " is empty!")
                continue

            for entry in entries:
                userComment = Comment()
                keep = ("[deleted]" not in entry['body']
                        and "None" not in entry['author'])
                if not keep:
                    # The comment was deleted: skip it.
                    continue

                # Text and author.
                userComment.setBody(entry['body'])
                userComment.setDepth(int(entry['depth']))
                userComment.setScore(int(entry['score']))
                userComment.setAuthor(entry['author'])
                # Voting figures.
                userComment.setUps(int(entry['ups']))
                userComment.setDowns(int(entry['downs']))
                userComment.setControversiality(entry['controversiality'])
                userComment.setGilded(entry['gilded'])
                # Identity and placement.
                userComment.setName(entry['name'])
                userComment.setSubreddit(entry['subreddit'])
                userComment.setSubredditId(entry['subreddit_id'])
                userComment.setCreatedAt(int(entry['created']))
                userComment.setCreatedUTC(int(entry['created_utc']))
                userComment.setParentId(entry['parent_id'])

                topic.addComment(userComment)

    # Finalise the topic once every file has been read.
    topic.setAuthors()
    topic.setCommentFromId()
    self.topics.append(topic)
def parseJSONFiles(self, fileNames):
    """Load comment JSON files into one topic and append it to ``self.topics``.

    The first file name decides both the topic kind and the topic name:
    the kind is ``"controvertial"`` when the name contains ``controversial``
    and ``"top"`` otherwise; the name is the text between ``top_`` and the
    following underscore.

    Every file must hold an object with a ``data`` list. Entries whose body
    contains ``[deleted]`` or whose author contains ``None`` are skipped;
    all others are copied field by field into a ``Comment`` and added to the
    topic. A file with an empty ``data`` list is reported and skipped.

    Args:
        fileNames: Sequence of JSON file paths. An empty sequence makes the
            call a no-op.

    Returns:
        None.
    """
    if len(fileNames) == 0:
        return

    leadName = fileNames[0]
    topicFactory = TopicFactory()

    # Both topic kinds derive the name the same way, so only the factory key
    # depends on the file name.
    isControversial = "controversial" in leadName
    topicName = leadName.split("top_")[1].split("_")[0]
    topic = topicFactory.factory("controvertial" if isControversial else "top")

    topic.clearAuthors()
    topic.setName(topicName)

    for path in fileNames:
        with open(path) as handle:
            entries = json.load(handle)['data']
            if len(entries) == 0:
                print("File: " + str(path) + " is empty!")
                continue

            for entry in entries:
                userComment = Comment()
                keep = ("[deleted]" not in entry['body']
                        and "None" not in entry['author'])
                if not keep:
                    # The comment was deleted: skip it.
                    continue

                # Text and author.
                userComment.setBody(entry['body'])
                userComment.setDepth(int(entry['depth']))
                userComment.setScore(int(entry['score']))
                userComment.setAuthor(entry['author'])
                # Voting figures.
                userComment.setUps(int(entry['ups']))
                userComment.setDowns(int(entry['downs']))
                userComment.setControversiality(entry['controversiality'])
                userComment.setGilded(entry['gilded'])
                # Identity and placement.
                userComment.setName(entry['name'])
                userComment.setSubreddit(entry['subreddit'])
                userComment.setSubredditId(entry['subreddit_id'])
                userComment.setCreatedAt(int(entry['created']))
                userComment.setCreatedUTC(int(entry['created_utc']))
                userComment.setParentId(entry['parent_id'])

                topic.addComment(userComment)

    # Finalise the topic once every file has been read.
    topic.setAuthors()
    topic.setCommentFromId()
    self.topics.append(topic)