def addToVocab(word, answer_id, priority_type, priority_val):
    """Accumulate priority_val for (word, answer_id, priority_type) in VOCAB_DICT.

    When VOCAB_DICT reaches VOCAB_DICT_LIMIT entries, the dict is first
    flushed to a temporary index file (OUTPUT_DIR + TEMP_FILE_PREFIX + N)
    via FileUtil.writeToFile and cleared, so memory stays bounded.

    word          -- (stemmed) vocabulary term used as the outer key
    answer_id     -- id of the answer this term occurs in
    priority_type -- priority bucket key (e.g. 'p'/'q'/'r' in the callers)
    priority_val  -- count/weight to add for this bucket
    """
    # BUG FIX: the original reassigned TEMP_FILE_COUNTER without declaring it
    # global, which makes both the read and the increment below raise
    # UnboundLocalError as soon as the flush branch is entered.
    global TEMP_FILE_COUNTER
    if len(VOCAB_DICT) >= VOCAB_DICT_LIMIT:
        # create a temporary index file, then start a fresh in-memory dict
        outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)
        FileUtil.writeToFile(VOCAB_DICT, outfilePath)
        TEMP_FILE_COUNTER += 1
        VOCAB_DICT.clear()
    # Equivalent to the original nested if/else ladder: create the missing
    # levels on demand, then add priority_val into the existing bucket.
    prts = VOCAB_DICT.setdefault(word, {}).setdefault(answer_id, {})
    prts[priority_type] = prts.get(priority_type, 0) + priority_val
# NOTE(review): whitespace-mangled line -- an entire script variant collapsed
# onto one physical line. Three fused parts are visible: (1) dangling "else:"
# branches that are the tail of an addToVocab-style vocab update whose function
# header is cut off (not reconstructable from this view); (2) a complete
# processText(text, priority) that stems and Counter-counts the cleaned words
# of `text` and calls addToVocab for every term not in STOP_WORDS and longer
# than 2 chars -- it reads `ans_id` from module scope, so it must only be
# called from inside the loop below; (3) a top-level loop over
# FileUtil.readCMUQAData(qaFile1) that resolves/saves each QA pair via
# MongoUtil and indexes title/question/answer with priority codes 'p'/'q'/'r',
# printing progress every 100 records (Python 2 print statement).
# Left byte-identical: the original line breaks cannot be safely restored here.
else: prts[priority_type] = priority_val else: answers[answer_id] = {priority_type: priority_val} else: VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}} def processText(text, priority): tFreq = Counter([stemmer.stem(kw) for kw in TextUtil.cleanUpText(text).split()]) for tWord in tFreq: if tWord not in STOP_WORDS and len(tWord) > 2: addToVocab(tWord, ans_id, priority, tFreq[tWord]) linecount = 0 for (title, ques, ans) in FileUtil.readCMUQAData(qaFile1): ans_id = MongoUtil.getAnswerID(ans) MongoUtil.saveQARelation(ques, ans_id) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 100 == 0: print "Read %d QA data..." % linecount
# NOTE(review): whitespace-mangled line (script variant collapsed onto one
# physical line). Visible parts: a truncated addToVocab tail (dangling
# "else:" branches, header cut off); processText(text, priority), which stems
# and counts cleaned words and feeds non-stopword terms longer than 2 chars to
# addToVocab using the module-scope `ans_id`; and a top-level loop over
# FileUtil.readNistQAData(qaFilesDir) saving QA relations to Mongo and
# indexing title/question/answer with codes 'p'/'q'/'r', with a Python 2
# progress print every 100 records. Left byte-identical -- the original line
# breaks cannot be safely reconstructed from this view.
else: prts[priority_type] = priority_val else: answers[answer_id] = {priority_type: priority_val} else: VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}} def processText(text, priority): tFreq = Counter([stemmer.stem(kw) for kw in TextUtil.cleanUpText(text).split()]) for tWord in tFreq: if tWord not in STOP_WORDS and len(tWord) > 2: addToVocab(tWord, ans_id, priority, tFreq[tWord]) linecount = 0 for (title, ques, ans) in FileUtil.readNistQAData(qaFilesDir): ans_id = MongoUtil.getAnswerID(ans) MongoUtil.saveQARelation(ques, ans_id) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 100 == 0: print "Read %d QA files..." % linecount
# NOTE(review): whitespace-mangled line (script variant collapsed onto one
# physical line). Visible parts: a truncated addToVocab tail (two dangling
# "else:" branches, header cut off); processText(text, priority) as in the
# sibling variants (stem + Counter, skip STOP_WORDS and terms of length <= 2,
# call addToVocab with the module-scope `ans_id`); and a top-level loop over
# FileUtil.readJeopardyQAData(qaFilePath) saving QA relations to Mongo and
# indexing title/question/answer with codes 'p'/'q'/'r', with a Python 2
# progress print every 100 records. Left byte-identical -- the original line
# breaks cannot be safely reconstructed from this view.
else: answers[answer_id] = {priority_type: priority_val} else: VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}} def processText(text, priority): tFreq = Counter( [stemmer.stem(kw) for kw in TextUtil.cleanUpText(text).split()]) for tWord in tFreq: if tWord not in STOP_WORDS and len(tWord) > 2: addToVocab(tWord, ans_id, priority, tFreq[tWord]) linecount = 0 for (title, ques, ans) in FileUtil.readJeopardyQAData(qaFilePath): ans_id = MongoUtil.getAnswerID(ans) MongoUtil.saveQARelation(ques, ans_id) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 100 == 0: print "Read %d QA files..." % linecount
# NOTE(review): whitespace-mangled tail of a script; the line begins
# mid-expression at "SE_ANSID_COUNTER)" -- the enclosing for-loop header and
# the start of the ans_id assignment are outside this view. What is visible:
# a synthetic StackExchange-style answer id counter is incremented (Mongo
# calls are commented out in favor of this), the id/question/answer triple is
# written tab-separated and UTF-8-encoded to ansid_file, title/question/answer
# are indexed via processText with codes 'p'/'q'/'r', and elapsed time is
# printed every 1000 records. After the loop, any remaining VOCAB_DICT entries
# are flushed to one last temp index file (TEMP_FILE_COUNTER is not
# incremented here -- presumably fine since it is the final flush, but worth
# confirming), ansid_file is closed, and total time is printed (Python 2
# print statements). Left byte-identical -- not safely reconstructable here.
SE_ANSID_COUNTER) #MongoUtil.getAnswerID(ans) SE_ANSID_COUNTER += 1 #MongoUtil.saveQARelation(ques, ans_id) ansid_file.write( (ans_id + '\t' + ques + '\t' + ans + '\n').encode('utf-8')) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 1000 == 0: print "time taken to read %d : " % linecount, time.time() - start_t start_t = time.time() # Write to file if its still left if len(VOCAB_DICT) > 0: # create a temporary index file outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER) FileUtil.writeToFile(VOCAB_DICT, outfilePath) VOCAB_DICT.clear() ansid_file.close() print "Total time : ", time.time() - tot_t print 'Done!'
# NOTE(review): whitespace-mangled line, a near-duplicate of the NIST variant
# above, collapsed onto one physical line. Visible parts: a truncated
# addToVocab tail (dangling "else:" branches, header cut off);
# processText(text, priority) (stem + Counter over cleaned words, skip
# STOP_WORDS and terms of length <= 2, call addToVocab with the module-scope
# `ans_id`); and a top-level loop over FileUtil.readNistQAData(qaFilesDir)
# saving QA relations to Mongo and indexing title/question/answer with codes
# 'p'/'q'/'r', printing progress every 100 records (Python 2 print).
# Left byte-identical -- original line breaks cannot be safely reconstructed.
else: answers[answer_id] = {priority_type: priority_val} else: VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}} def processText(text, priority): tFreq = Counter( [stemmer.stem(kw) for kw in TextUtil.cleanUpText(text).split()]) for tWord in tFreq: if tWord not in STOP_WORDS and len(tWord) > 2: addToVocab(tWord, ans_id, priority, tFreq[tWord]) linecount = 0 for (title, ques, ans) in FileUtil.readNistQAData(qaFilesDir): ans_id = MongoUtil.getAnswerID(ans) MongoUtil.saveQARelation(ques, ans_id) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 100 == 0: print "Read %d QA files..." % linecount
# NOTE(review): whitespace-mangled tail of a script; this line starts with
# loop-body code (ans_id assignment), so the enclosing for-loop header is
# outside this view. Visible behavior: a synthetic answer id is built from
# SE_ANSID_PREFIX + SE_ANSID_COUNTER and the counter incremented (the Mongo
# equivalents are commented out); the id/question/answer triple is written
# tab-separated, UTF-8-encoded, to ansid_file; title/question/answer are
# indexed via processText with codes 'p'/'q'/'r'; elapsed time is printed
# every 1000 records. After the loop, remaining VOCAB_DICT entries are flushed
# to a final temp index file (TEMP_FILE_COUNTER not incremented here, unlike
# the flush inside addToVocab -- presumably fine as the final flush, but
# confirm), ansid_file is closed, and total time is printed (Python 2 print).
# Left byte-identical -- not safely reconstructable from this view.
ans_id = SE_ANSID_PREFIX + str(SE_ANSID_COUNTER) #MongoUtil.getAnswerID(ans) SE_ANSID_COUNTER += 1 #MongoUtil.saveQARelation(ques, ans_id) ansid_file.write((ans_id + '\t' + ques + '\t' + ans + '\n').encode('utf-8')) # Work on title processText(title, 'p') # Work on ques processText(ques, 'q') # Work on ans processText(ans, 'r') linecount = linecount + 1 if linecount % 1000 == 0: print "time taken to read %d : " % linecount, time.time() - start_t start_t = time.time() # Write to file if its still left if len(VOCAB_DICT) > 0: # create a temporary index file outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER) FileUtil.writeToFile(VOCAB_DICT, outfilePath) VOCAB_DICT.clear() ansid_file.close() print "Total time : ", time.time() - tot_t print 'Done!'
def startElement(self, name, attrs): if name == 'row': try: if attrs['PostTypeId'] == '1' and 'AcceptedAnswerId' in attrs: # Handle questions qid = self.id_prefix + '.' + attrs['Id'] body = TextUtil.strip_tags(attrs['Body']) title = attrs['Title'] aid = self.id_prefix + '.' + attrs['AcceptedAnswerId'] MongoUtil.saveSEQuestion(qid, body, title, aid) else: # Handle answers aid = self.id_prefix + '.' + attrs['Id'] body = TextUtil.strip_tags(attrs['Body']) MongoUtil.saveSEAnswer(aid, body) except: pass parser = xml.sax.make_parser() for cfile in ['/home/brij/Documents/moody/datasets/stackexchange_data/stackoverflow/stackoverflow.com-Posts.7z']: #FileUtil.getSO7zFiles("/home/brij/Documents/moody/datasets/stackexchange_data/"): fname = FileUtil.getFilenameWithoutExt(cfile) print "Extracting %s ..." % fname FileUtil.extractPostsXml(cfile, OUTPUT_FOLDER) print "Parsing %s ..." % fname parser.setContentHandler(SEPostXmlHandler(fname)) parser.parse(open(OUTPUT_XML, 'r')) print "Done parsing %s ..." % fname