def __init__(self, *args, **kwargs): """ Initialization assumes that SENTTREE_PATH environment is set. """ DocumentReader.__init__(self, *args, **kwargs) self.dbstring = os.environ["SENTTREE_DBSTRING"] self.postgres_recorder = PostgresDataRecorder(self.dbstring) self.folderPath = os.environ['SENTTREE_PATH']
def __init__(self,*args, **kwargs): """ Initialization assumes that IMDB_PATH environment is set. To set in linux or mac: export IMDB_PATH=/some_directory_containing_IMDB_data """ DocumentReader.__init__(self, *args, **kwargs) self.dbstring = os.environ["IMDB_DBSTRING"] self.postgres_recorder = PostgresDataRecorder(self.dbstring) self.folderPath = os.environ['IMDB_PATH']
def __init__(self, *args, **kwargs): """ It reads he environment variable and initializes the base class. """ DocumentReader.__init__(self, *args, **kwargs) self.dbstring = os.environ["REUTERS_DBSTRING"] self.postgres_recorder = PostgresDataRecorder(self.dbstring) self.folderPath = os.environ['REUTERS_PATH'] self.validationDict = {}
def __init__(self, *args, **kwargs): """ Initialization assumes that NEWSGROUP_PATH environment is set. To set in linux or mac: export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data """ DocumentReader.__init__(self, *args, **kwargs) self.dbstring = os.environ["NEWSGROUP_DBSTRING"] self.postgres_recorder = PostgresDataRecorder(self.dbstring) self.folderPath = os.environ['NEWSGROUP_PATH'] self.validationDict = {} self.topic_names = []
def __init__(self,*args, **kwargs): """ It reads he environment variable and initializes the base class. """ DocumentReader.__init__(self, *args, **kwargs) self.dbstring = os.environ["DUC_DBSTRING"] self.postgres_recorder = PostgresDataRecorder(self.dbstring) self.folderPath = os.environ['DUC_PATH'] self.processed_filenames = [] self.processed_summaries = [] self.lambda_val = os.environ['DUC_LAMBDA'] self.diversity = os.environ['DUC_DIVERSITY'] self.duc_topic = os.environ['DUC_TOPIC'] self.document_id = 0
class IMDBReader(DocumentReader):
    """
    IMDB Document Reader. Reads IMDB documents extracted from:
    """

    def __init__(self, *args, **kwargs):
        """
        Initialization assumes that the IMDB_PATH environment variable is set.
        To set it in Linux or Mac:
        export IMDB_PATH=/some_directory_containing_IMDB_data
        """
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["IMDB_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['IMDB_PATH']

    def readTopic(self):
        """
        Reads the topic names from the train folder.
        """
        rootDir = "%s/train" % self.folderPath
        return self._getTopics(rootDir)

    def readDocument(self, ld):
        """
        Records each document, its topic, and its train/test split.
        """
        if ld <= 0:
            return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        topic_names = self.readTopic()

        document_id = 0
        for first_level_folder in next(os.walk(self.folderPath))[1]:
            if not (DocumentReader._folderISHidden(self, first_level_folder)):
                for topic in topic_names:
                    if first_level_folder == 'test' and topic == 'unsup':
                        continue
                    for file_ in os.listdir("%s%s%s%s%s" % (self.folderPath, "/",
                            first_level_folder, "/", topic)):
                        doc_content = self._getTextFromFile("%s%s%s%s%s%s%s"
                            % (self.folderPath, "/", first_level_folder, "/",
                               topic, "/", file_))
                        document_id += 1
                        title, metadata, istrain = None, None, None
                        try:
                            trainortest = first_level_folder
                            metadata = "SPLIT:%s" % trainortest
                            istrain = 'YES' if trainortest.lower() == 'train' else 'NO'
                        except:
                            Logger.logr.info("NO MetaData or Train Test Tag")
                        self.postgres_recorder.insertIntoDocTable(document_id, title,
                            doc_content, file_, metadata)
                        category = topic.split('.')[0]
                        self.postgres_recorder.insertIntoDocTopTable(document_id,
                            [topic], [category])
                        self._recordParagraphAndSentence(document_id, doc_content,
                            self.postgres_recorder, topic, istrain)

        Logger.logr.info("Document reading complete.")
        return 1

    def runBaselines(self):
        """
        Runs the baseline runners against the recorded documents.
        """
        latent_space_size = 300
        Logger.logr.info("Starting Running Para2vec (Doc) Baseline")
        # paraBaseline = Paragraph2VecSentenceRunner(self.dbstring)
        # paraBaseline.prepareData()
        # paraBaseline.runTheBaseline(latent_space_size)

        # Logger.logr.info("Starting Running Node2vec Baseline")
        # n2vBaseline = Node2VecRunner(self.dbstring)
        # n2vBaseline.prepareData()
        # paraBaseline.runEvaluationTask()
        # paraBaseline.runClassificationTask()
        # n2vBaseline.runTheBaseline(latent_space_size)

        # Logger.logr.info("Starting Running Iterative Update Method")
        # iterUdateBaseline = IterativeUpdateRetrofitRunner(self.dbstring)
        # iterUdateBaseline.prepareData()
        # iterUdateBaseline.runTheBaseline()

        # docBaseLine = Paragraph2VecRunner(self.dbstring)
        # docBaseLine.prepareData()
        # docBaseLine.runTheBaseline(latent_space_size)
        # docBaseLine.runEvaluationTask()
        # docBaseLine.runClassificationTask()

        docBaseLineCEXE = Paragraph2VecCEXERunner(self.dbstring)
        docBaseLineCEXE.prepareData()
        docBaseLineCEXE.runTheBaseline(latent_space_size)
        docBaseLineCEXE.runEvaluationTask()
        docBaseLineCEXE.runClassificationTask()
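# A hedged usage sketch (added for illustration; not part of the original
# pipeline): how IMDBReader would typically be driven. The environment
# variable names come from __init__ above; the DBSTRING value and the data
# path below are hypothetical placeholders.
def _imdb_reader_usage_sketch():
    os.environ["IMDB_DBSTRING"] = "dbname=imdb user=postgres"  # assumed format
    os.environ["IMDB_PATH"] = "/data/aclImdb"                  # hypothetical path
    reader = IMDBReader()
    reader.readDocument(1)  # ld > 0 actually loads the corpus into Postgres
    reader.runBaselines()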
class SentimentTreeBank2WayReader(DocumentReader):
    """
    Stanford Sentiment TreeBank Reader (two-way: positive/negative).
    """

    def __init__(self, *args, **kwargs):
        """
        Initialization assumes that the SENTTREE_PATH environment variable is set.
        """
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["SENTTREE_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['SENTTREE_PATH']

    def readTopic(self):
        topic_names = ['pos', 'neg', 'unsup']
        categories = ['pos', 'neg', 'unsup']
        self.postgres_recorder.insertIntoTopTable(topic_names, categories)
        Logger.logr.info("[%i] Topic reading complete." % (len(topic_names)))
        return topic_names

    def readDSplit(self, fileName):
        """
        Split ids: 1 = train, 2 = test, 3 = dev.
        """
        line_count = 0
        dSPlitDict = {}
        for line in open(fileName, encoding='utf-8', errors='ignore'):
            if line_count == 0:  # skip the header line
                pass
            else:
                doc_id, _, splitid = line.strip().partition(",")
                dSPlitDict[int(doc_id)] = int(splitid)
            line_count = line_count + 1
        Logger.logr.info("Finished reading %i sentences and their splits" % line_count)
        return dSPlitDict

    def readSentences(self, fileName):
        line_count = 0
        sentenceDict = {}
        for line in open(fileName, encoding='utf-8', errors='ignore'):
            if line_count == 0:  # skip the header line
                pass
            else:
                doc_id, _, sentence = line.strip().partition("\t")
                sentenceDict[int(doc_id)] = sentence.strip()
            line_count = line_count + 1
        # Log before returning (the original logged after the return,
        # which was unreachable).
        Logger.logr.info("Finished reading %i sentences" % line_count)
        return sentenceDict

    def phraseToSentiment(self, fileName):
        line_count = 0
        phraseToSentimentDict = {}
        for line in open(fileName, encoding='utf-8', errors='ignore'):
            if line_count == 0:  # skip the header line
                pass
            else:
                phrase_id, _, sentiment = line.strip().partition("|")
                phraseToSentimentDict[int(phrase_id)] = float(sentiment)
            line_count = line_count + 1
        # Log before returning (same unreachable-log fix as above).
        Logger.logr.info("Finished reading %i phrases" % line_count)
        return phraseToSentimentDict

    def getTopicCategory(self, sentiment_val):
        """
        [0, 0.2]   very negative
        (0.2, 0.4] negative
        (0.4, 0.6] neutral
        (0.6, 0.8] positive
        (0.8, 1.0] very positive
        """
        if sentiment_val <= 0.4:
            return ('neg', 'neg')
        elif sentiment_val > 0.6:
            return ('pos', 'pos')
        else:
            return ('unsup', 'unsup')

    def readDocument(self, ld):
        """
        Skip neutral phrases.
        """
        if ld <= 0:
            return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        topic_names = self.readTopic()

        allPhrasesFile = "%s/dictionary.txt" % (self.folderPath)
        dSPlitDict = self.readDSplit("%s/datasetSplit.txt" % self.folderPath)
        sentenceDict = self.readSentences("%s/datasetSentences.txt" % self.folderPath)
        phraseToSentimentDict = self.phraseToSentiment(
            "%s/sentiment_labels.txt" % self.folderPath)

        for line in open(allPhrasesFile, encoding='utf-8', errors='ignore'):
            phrase, _, phrase_id = line.strip().partition("|")
            contains_in_train, contains_in_test = False, False
            contains_in_dev, is_a_sentence = False, False
            sentiment_val = phraseToSentimentDict[int(phrase_id)]
            topic, category = self.getTopicCategory(sentiment_val)
            for sent_id, sentence in sentenceDict.items():
                if phrase in sentence:
                    train_label = dSPlitDict[sent_id]
                    if train_label == 1:
                        contains_in_train = True
                    elif train_label == 2:
                        contains_in_test = True
                    elif train_label == 3:
                        contains_in_dev = True
                    if phrase == sentence:
                        is_a_sentence = True

            # All neutral phrases are considered part of training.
            if sentiment_val > 0.4 and sentiment_val <= 0.6:
                metadata = "SPLIT:%s" % ('unsup')
                istrain = 'MAYBE'
            elif contains_in_test == True and contains_in_train == False and \
                    contains_in_dev == False and is_a_sentence == True:
                metadata = "SPLIT:%s" % ('test')
                istrain = "NO"
            elif contains_in_train == True and contains_in_test == False and \
                    contains_in_dev == False:
                metadata = "SPLIT:%s" % ('train')
                istrain = 'YES'
            else:
                metadata = "SPLIT:%s" % ('unsup')
                istrain = 'MAYBE'
                topic, category = 'unsup', 'unsup'

            self.postgres_recorder.insertIntoDocTable(phrase_id, "",
                phrase, "", metadata)
            self.postgres_recorder.insertIntoDocTopTable(phrase_id,
                [topic], [category])
            self._recordParagraphAndSentence(phrase_id, phrase,
                self.postgres_recorder, topic, istrain)

        Logger.logr.info("Document reading complete.")
        return 1

    def runBaselines(self):
        """
        Runs the baseline runners against the recorded documents.
        """
        latent_space_size = 300
        Logger.logr.info("Starting Running Para2vec (Doc) Baseline")
        # paraBaseline = Paragraph2VecSentenceRunner(self.dbstring)
        # paraBaseline.prepareData()
        # paraBaseline.runTheBaseline(latent_space_size)

        # Logger.logr.info("Starting Running Node2vec Baseline")
        # n2vBaseline = Node2VecRunner(self.dbstring)
        # n2vBaseline.prepareData()
        # paraBaseline.runEvaluationTask()
        # paraBaseline.runClassificationTask()
        # n2vBaseline.runTheBaseline(latent_space_size)

        # Logger.logr.info("Starting Running Iterative Update Method")
        # iterUdateBaseline = IterativeUpdateRetrofitRunner(self.dbstring)
        # iterUdateBaseline.prepareData()
        # iterUdateBaseline.runTheBaseline()

        # docBaseLine = Paragraph2VecRunner(self.dbstring)
        # docBaseLine.prepareData()
        # docBaseLine.runTheBaseline(latent_space_size)
        # docBaseLine.runEvaluationTask()
        # docBaseLine.runClassificationTask()

        docBaseLineCEXE = Paragraph2VecCEXERunner(self.dbstring)
        docBaseLineCEXE.prepareData()
        docBaseLineCEXE.runTheBaseline(latent_space_size)
        docBaseLineCEXE.runEvaluationTask()
        docBaseLineCEXE.runClassificationTask()
class NewsGroupReader(DocumentReader):
    """
    News Group Document Reader.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialization assumes that the NEWSGROUP_PATH environment variable is set.
        To set it in Linux or Mac:
        export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data
        """
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["NEWSGROUP_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['NEWSGROUP_PATH']
        self.validationDict = {}
        self.topic_names = []

    def __stripNewsgroupHeader(self, text):
        """
        Given text in "news" format, strip the headers by removing everything
        before the first blank line.
        """
        _before, _blankline, after = text.partition('\n\n')
        return after

    def __stripNewsgroupQuoting(self, text):
        """
        Given text in "news" format, strip lines beginning with the quote
        characters > or |, plus lines that often introduce a quoted section
        (for example, because they contain the string 'writes:').
        """
        _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                               r'|^In article|^Quoted from|^\||^>)')
        good_lines = [line for line in text.split('\n')
                      if not _QUOTE_RE.search(line)]
        return '\n'.join(good_lines)

    def __stripNewsgroupFooter(self, text):
        """
        Given text in "news" format, attempt to remove a signature block.
        As a rough heuristic, we assume that signatures are set apart by either
        a blank line or a line made of hyphens, and that it is the last such
        line in the file (disregarding blank lines at the end).
        """
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            line = lines[line_num]
            if line.strip().strip('-') == '':
                break
        if line_num > 0:
            return '\n'.join(lines[:line_num])
        else:
            return text

    def readTopic(self):
        """
        http://pythoncentral.io/how-to-traverse-a-directory-tree-in-python-guide-to-os-walk/
        """
        rootDir = "%s/20news-bydate-train" % self.folderPath
        return self._getTopics(rootDir)

    def stripDocContent(self, doc_content):
        doc_content = self.__stripNewsgroupHeader(doc_content)
        doc_content = self.__stripNewsgroupFooter(doc_content)
        return self.__stripNewsgroupQuoting(doc_content)

    def __createValidationSet(self, document_ids):
        total_doc = len(document_ids)
        nvalid_doc = int(total_doc * 0.20)  # np.random.choice expects an integer size
        np.random.seed(2000)
        valid_list = np.random.choice(document_ids, nvalid_doc,
            replace=False).tolist()
        for id_ in valid_list:
            self.validationDict[id_] = 1

    def __readAPass(self, load=0):
        if load == 0:
            self.topic_names = self.readTopic()

        train_doc_ids = []
        document_id = 0
        for first_level_folder in os.listdir(self.folderPath):
            if not (DocumentReader._folderISHidden(self, first_level_folder)):
                for topic in self.topic_names:
                    if topic not in ['talk.politics.mideast', 'comp.graphics',
                                     'soc.religion.christian', 'rec.autos',
                                     'sci.space', 'talk.politics.guns',
                                     'rec.sport.baseball', 'sci.med']:
                        continue
                    for file_ in os.listdir("%s%s%s%s%s" % (self.folderPath, "/",
                            first_level_folder, "/", topic)):
                        doc_content = self._getTextFromFile("%s%s%s%s%s%s%s"
                            % (self.folderPath, "/", first_level_folder, "/",
                               topic, "/", file_))
                        doc_content = self.stripDocContent(doc_content)
                        document_id += 1
                        title, metadata, istrain = None, None, None
                        try:
                            trainortest = first_level_folder.split('-')[-1]
                            metadata = "SPLIT:%s" % trainortest
                            istrain = 'YES' if (trainortest.lower() == 'train') else 'NO'
                        except:
                            Logger.logr.info("NO MetaData or Train Test Tag")
                        if istrain == 'YES':
                            train_doc_ids.append(document_id)
                        if document_id in self.validationDict:
                            istrain = 'VALID'
                        if load == 1:
                            self.postgres_recorder.insertIntoDocTable(document_id, title,
                                doc_content, file_, metadata)
                            category = topic.split('.')[0]
                            self.postgres_recorder.insertIntoDocTopTable(document_id,
                                [topic], [category])
                            self._recordParagraphAndSentence(document_id, doc_content,
                                self.postgres_recorder, topic, istrain)

        Logger.logr.info("A pass of the document reading complete.")
        return train_doc_ids

    def readDocument(self, ld):
        """
        Stripping is inactive by default. For future reference, it has been
        imported from the scikit-learn newsgroup reader package.
        """
        if ld <= 0:
            return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        train_doc_ids = self.__readAPass(0)
        self.__createValidationSet(train_doc_ids)
        self.__readAPass(1)
        return 1

    def runBaselines(self, pd, rbase, gs):
        """
        Runs the clustering baselines (classification is commented out).
        """
        # optDict = self._runClassificationOnValidation(pd, rbase, gs, "news")
        # self.doTesting(optDict, "news", rbase, pd, gs, True)
        optDict = self._runClusteringOnValidation(pd, rbase, gs, "news")
        self.doTesting(optDict, "news", rbase, pd, gs, False)
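# A small sketch (added for illustration; the message text is made up) of what
# the three stripping heuristics above do to a toy newsgroup post. __new__ is
# used to skip __init__, since stripDocContent needs no environment variables.
def _newsgroup_strip_sketch():
    raw = ("From: someone@example.com\n"
           "Subject: test\n"
           "\n"
           "John writes:\n"
           "> quoted line\n"
           "actual body text\n"
           "\n"
           "--\n"
           "signature")
    reader = NewsGroupReader.__new__(NewsGroupReader)
    cleaned = reader.stripDocContent(raw)
    # The header (everything before the first blank line), the 'writes:' line,
    # the '>' quote, and the signature block are all removed, leaving just the
    # body line "actual body text".
    print(cleaned)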
class DUCReader(DocumentReader):
    """
    DUC Document Reader.
    """

    def __init__(self, *args, **kwargs):
        """
        It reads the environment variables and initializes the base class.
        """
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["DUC_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['DUC_PATH']
        self.processed_filenames = []
        self.processed_summaries = []
        self.lambda_val = os.environ['DUC_LAMBDA']
        self.diversity = os.environ['DUC_DIVERSITY']
        self.duc_topic = os.environ['DUC_TOPIC']
        self.document_id = 0

    def readTopic(self):
        """
        Records DUC years as topics.
        """
        topic_names = ['2001', '2002', '2003', '2004', '2005', '2006', '2007']
        categories = topic_names
        self.postgres_recorder.insertIntoTopTable(topic_names, categories)
        Logger.logr.info("Topic reading complete.")

    def recordDocuments(self, documents, topic, summaryFileDict):
        docFileDict = {}
        for document in documents:
            filename = document.split(os.path.sep)[-1]  # e.g., ft923-5089
            if filename in self.processed_filenames:  # don't store duplicate files
                continue
            if filename not in summaryFileDict:
                continue
            doc_content = self._getTextFromFile("%s" % (document))
            soup = BeautifulSoup(doc_content, "html.parser")
            try:
                doc_content = soup.find('text').text.strip()
            except:
                Logger.logr.info("%s %s" % (document,
                    "Skipping. Cause, TEXT tag not found"))
                continue
            if doc_content.count('.') > 1000 or doc_content.count('.') < 1:
                Logger.logr.info("%s %s" % (document,
                    "Skipping. Cause, %s sentences." % doc_content.count('.')))
                continue
            if len(doc_content.split()) < 100:
                continue

            self.processed_filenames += [filename]
            docFileDict[filename] = 1
            self.document_id += 1
            title, metadata, istrain = None, None, 'YES'

            self.postgres_recorder.insertIntoDocTable(self.document_id, title,
                doc_content, filename, metadata)
            category = topic.split('.')[0]
            self.postgres_recorder.insertIntoDocTopTable(self.document_id,
                [topic], [category])
            self._recordParagraphAndSentence(self.document_id, doc_content,
                self.postgres_recorder, topic, istrain)
        return docFileDict

    def __recordSummariesA(self, summaries, document_dict):
        """
        First check whether the corresponding valid document is in the database.
        """
        for summary in summaries:
            doc_content = self._getTextFromFile("%s" % (summary))
            soup = BeautifulSoup(doc_content, "html.parser")
            sums = soup.findAll('sum')
            for sum_ in sums:
                filename = sum_.get('docref')
                doc_content = sum_.text.strip()
                if filename not in document_dict:
                    Logger.logr.info("Checking %s in document dict" % filename)
                    continue
                method_id = 20  # DUC = 20
                summarizer = sum_.get('summarizer')
                metadata = "SUMMARIZER:%s" % (summarizer)
                if "%s%s" % (filename, summarizer) in self.processed_summaries:
                    continue
                self.processed_summaries += ["%s%s" % (filename, summarizer)]
                self.postgres_recorder.insertIntoGoldSumTable(filename, doc_content,
                    method_id, metadata)

    def __getSummaryFileNames(self, summaryFile):
        doc_content = self._getTextFromFile("%s" % (summaryFile))
        soup = BeautifulSoup(doc_content, "html.parser")
        summaries = soup.findAll('sum')
        filenames = []
        for summary in summaries:
            filename = summary.get('docref')
            doc_content = summary.text
            if len(doc_content.split()) < 100:
                continue
            else:
                filenames.append(filename)
        return filenames

    def __getValidSummaryFiles(self, summaries, summaryFileDict):
        for summary in summaries:
            fileNames = self.__getSummaryFileNames(summary)
            for names in fileNames:
                summaryFileDict[names] = 1
        return summaryFileDict

    def __readDUC2001(self):
        """
        It loads the DUC 2001 documents into the database. A summary with fewer
        than 100 words is discarded; as a rough heuristic, the text is split on
        whitespace and the words counted. The function also makes sure that
        there will be no document without a summary.
        """
        topic = "2001"
        cur_path = "%s/%s" % (self.folderPath, "DUC2001")

        # One pass to collect all valid summary file names.
        summaries, documents = [], []
        for root, directories, files in os.walk(cur_path):
            documents += [os.path.join(root, file_)
                for file_ in files if file_ not in ['50', '100', '200', '400', 'perdocs']]
            summaries += [os.path.join(root, file_)
                for file_ in files if file_ in "perdocs"]

        summaryFileDict = {}
        summaryFileDict = self.__getValidSummaryFiles(summaries, summaryFileDict)
        Logger.logr.info("Got %i documents and %i summaries" % (len(documents),
            len(summaryFileDict)))

        Logger.logr.info("Recording DUC 2001 Documents.")
        docFileDict = self.recordDocuments(documents, topic, summaryFileDict)
        Logger.logr.info("%i elements in summary dict and %i"
            " elements in doc dict" % (len(summaryFileDict), len(docFileDict)))
        Logger.logr.info("Recording DUC 2001 Summaries.")
        self.__recordSummariesA(summaries, docFileDict)

    def __readDUC2002(self):
        """
        It loads the DUC 2002 documents into the database, with the same
        100-word summary heuristic and no-document-without-summary guarantee
        as __readDUC2001.
        """
        topic = "2002"
        cur_path = "%s/%s" % (self.folderPath, "DUC2002")

        # One pass to collect all valid summary file names.
        summaries, documents = [], []
        for root, directories, files in os.walk(cur_path):
            documents += [os.path.join(root, file_)
                for file_ in files if file_ not in ['10', '50', '100', '200',
                    '400', '200e', '400e', 'perdocs']]
            summaries += [os.path.join(root, file_)
                for file_ in files if file_ in "perdocs"]

        summaryFileDict = {}
        summaryFileDict = self.__getValidSummaryFiles(summaries, summaryFileDict)
        Logger.logr.info("Got %i documents and %i summaries" % (len(documents),
            len(summaryFileDict)))

        Logger.logr.info("Recording DUC 2002 Documents.")
        docFileDict = self.recordDocuments(documents, topic, summaryFileDict)
        Logger.logr.info("%i elements in summary dict and %i"
            " elements in doc dict" % (len(summaryFileDict), len(docFileDict)))
        Logger.logr.info("Recording DUC 2002 Summaries.")
        self.__recordSummariesA(summaries, docFileDict)

    def readDocument(self, ld):
        if ld <= 0:
            return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.truncateSummaryTable()
        self.postgres_recorder.alterSequences()
        self.readTopic()

        document_id = 0
        if self.duc_topic == str(2001):
            self.__readDUC2001()
        else:
            self.__readDUC2002()
        # document_id = self._readDUC2003(document_id)
        # document_id = self._readDUC2004(document_id)
        # document_id = self._readDUC2005(document_id)
        # document_id = self._readDUC2006(document_id)
        # document_id = self._readDUC2007(document_id)
        return 1

    def __runSpecificEvaluation(self, models=[20], systems=[]):
        rougeInstance = Rouge()
        rPDict = rougeInstance.buildRougeParamDict()
        rPDict['-l'] = str(100)
        rPDict['-c'] = str(0.99)

        evaluation = RankingEvaluation(topics=[self.duc_topic], models=models,
            systems=systems)
        evaluation._prepareFiles()
        evaluation._getRankingEvaluation(rPDict, rougeInstance)

        rPDict['-l'] = str(10)
        evaluation._getRankingEvaluation(rPDict, rougeInstance)

    def __runCombinedEvaluation(self):
        rougeInstance = Rouge()
        rPDict = rougeInstance.buildRougeParamDict()
        rPDict['-l'] = str(100)
        rPDict['-c'] = str(0.99)

        evaluation = RankingEvaluation(topics=[self.duc_topic], models=[20],
            systems=[1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 21])
        evaluation._prepareFiles()
        evaluation._getRankingEvaluation(rPDict, rougeInstance)

        rPDict['-l'] = str(10)
        evaluation._getRankingEvaluation(rPDict, rougeInstance)

    def __getRecall(self, method_id, models, systems):
        output_file_name = ""
        for model in models:
            output_file_name += str(model) + "_"
        for system in systems:
            output_file_name += "_" + str(system)
        output_file_name += "_output"
        output_file_name += "_%s.txt" % (str(10))

        with open('%s%s%s' % (os.environ["SUMMARYFOLDER"], "/",
                output_file_name), 'r') as f:
            content = f.read()
            recall = float(content.split(
                "%s ROUGE-1 Average_R: " % method_id)[1].split(' ')[0])
        return recall

    def runBaselines(self, pd, rbase, gs):
        """
        Tunes the window size, beta, and alpha hyperparameters on the
        validation set, then runs all baselines on the test set with the
        optimal values.
        """
        ############# Validation ############################
        with open('%s%s%s%s' % (os.environ["TRTESTFOLDER"], "/", self.duc_topic,
                "_hyperparameters.txt"), 'w') as f:
            latent_space_size = 300
            diversity = False
            if self.diversity == str(1):
                diversity = True
            # createValidationSet()  Need to implement this function
            os.environ['DUC_EVAL'] = 'VALID'

            recalls = {}
            window_opt = None  # var for the optimal window
            # for window in ["8"]:
            for window in ["8", "10", "12"]:
                Logger.logr.info("Starting Running Para2vec Baseline for Window = %s" % window)
                self.postgres_recorder.truncateSummaryTable()
                paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
                if window == "8":
                    paraBaseline.prepareData(pd)
                paraBaseline.runTheBaseline(rbase, latent_space_size, window)
                paraBaseline.generateSummary(gs,
                    lambda_val=self.lambda_val, diversity=diversity)
                paraBaseline.doHouseKeeping()
                self.__runSpecificEvaluation(models=[20], systems=[2])  # Rouge for method_id = 2 only
                recalls[window] = self.__getRecall(method_id=2, models=[20], systems=[2])
                Logger.logr.info("Recall for %s = %s" % (window, recalls[window]))
            window_opt = max(recalls, key=recalls.get)  # window with the max recall
            f.write("Optimal window size is %s%s" % (window_opt, os.linesep))
            f.write("P2V Window Recalls: %s%s" % (recalls, os.linesep))
            f.flush()

            Logger.logr.info("Starting Running Para2vec Baseline for Optimal Window = %s" % window_opt)
            self.postgres_recorder.truncateSummaryTable()
            paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
            paraBaseline.runTheBaseline(rbase, latent_space_size,
                window_opt)  # we need the p2v vectors created with the optimal window
            paraBaseline.doHouseKeeping()

            recalls = {}
            beta_opt = None  # var for the optimal beta
            # for beta in ["0.3"]:
            for beta in ["0.3", "0.6", "0.9", "1.0"]:
                Logger.logr.info("Starting Running Node2vec Baseline for Beta = %s" % beta)
                self.postgres_recorder.truncateSummaryTable()
                n2vBaseline = Node2VecRunner(self.dbstring)
                n2vBaseline.mybeta = beta  # reinitializing mybeta
                generate_walk = False
                if beta == "0.3":
                    n2vBaseline.prepareData(pd)
                    generate_walk = True
                n2vBaseline.runTheBaseline(rbase, latent_space_size, generate_walk)
                n2vBaseline.generateSummary(gs, 5, "_retrofit",
                    lambda_val=self.lambda_val, diversity=diversity)
                n2vBaseline.doHouseKeeping()
                self.__runSpecificEvaluation(models=[20], systems=[5])  # Rouge for method_id = 5 only
                recalls[beta] = self.__getRecall(method_id=5, models=[20], systems=[5])
                Logger.logr.info("Recall for %s = %s" % (beta, recalls[beta]))
            beta_opt = max(recalls, key=recalls.get)  # beta with the max recall
            f.write("Optimal Beta is %s%s" % (beta_opt, os.linesep))
            f.write("N2V Beta Recalls: %s%s" % (recalls, os.linesep))
            f.flush()

            recalls = {}
            alpha_opt = None  # var for the optimal alpha
            # for alpha in [0.3]:
            for alpha in [0.3, 0.6, 0.8, 1.0]:
                Logger.logr.info("Starting Running Iterative Baseline for Alpha = %s" % alpha)
                self.postgres_recorder.truncateSummaryTable()
                iterrunner = IterativeUpdateRetrofitRunner(self.dbstring)
                iterrunner.myalpha = alpha  # reinitializing myalpha
                if alpha == 0.3:
                    iterrunner.prepareData(pd)
                iterrunner.runTheBaseline(rbase)
                iterrunner.generateSummary(gs, 7, "_weighted",
                    lambda_val=self.lambda_val, diversity=diversity)
                iterrunner.doHouseKeeping()
                self.__runSpecificEvaluation(models=[20], systems=[7])
                recalls[alpha] = self.__getRecall(method_id=7, models=[20], systems=[7])
                Logger.logr.info("Recall for %s = %s" % (alpha, recalls[alpha]))
            alpha_opt = max(recalls, key=recalls.get)  # alpha with the max recall
            Logger.logr.info("Optimal Alpha=%s" % alpha_opt)
            f.write("Optimal alpha is %.2f%s" % (alpha_opt, os.linesep))
            f.write("ITR Alpha Recalls: %s%s" % (recalls, os.linesep))
            f.flush()

            w_recalls = {}
            unw_recalls = {}
            w_opt = None
            unw_opt = None
            # for beta in [0.3]:
            for beta in [0.3, 0.6, 0.8, 1.0]:
                Logger.logr.info("Starting Running Regularized Baseline for Beta = %s" % beta)
                self.postgres_recorder.truncateSummaryTable()
                regs2v = RegularizedSen2VecRunner(self.dbstring)
                regs2v.regBetaW = beta
                regs2v.regBetaUNW = beta
                if beta == 0.3:
                    regs2v.prepareData(pd)
                regs2v.runTheBaseline(rbase, latent_space_size)
                regs2v.generateSummary(gs, 9, "_neighbor_w",
                    lambda_val=self.lambda_val, diversity=diversity)
                regs2v.generateSummary(gs, 10, "_neighbor_unw",
                    lambda_val=self.lambda_val, diversity=diversity)
                regs2v.doHouseKeeping()
                self.__runSpecificEvaluation(models=[20], systems=[9, 10])
                w_recalls[beta] = self.__getRecall(method_id=9, models=[20],
                    systems=[9, 10])
                unw_recalls[beta] = self.__getRecall(method_id=10, models=[20],
                    systems=[9, 10])
                Logger.logr.info("W_Recall for %s = %s" % (beta, w_recalls[beta]))
                Logger.logr.info("UNW_Recall for %s = %s" % (beta, unw_recalls[beta]))
            w_opt_reg = max(w_recalls, key=w_recalls.get)
            unw_opt_reg = max(unw_recalls, key=unw_recalls.get)
            Logger.logr.info("Optimal regBetaW=%s and regBetaUNW=%s"
                % (w_opt_reg, unw_opt_reg))
            f.write("Optimal REG BetaW : %.2f%s" % (w_opt_reg, os.linesep))
            f.write("Optimal REG BetaUNW : %.2f%s" % (unw_opt_reg, os.linesep))
            f.write("REG BetaW Recalls: %s%s" % (w_recalls, os.linesep))
            f.write("REG BetaUNW Recalls: %s%s" % (unw_recalls, os.linesep))
            f.flush()

            w_recalls = {}
            unw_recalls = {}
            w_opt = None
            unw_opt = None
            # for beta in [0.3]:
            for beta in [0.3, 0.6, 0.8, 1.0]:
                Logger.logr.info("Starting Running Dict Regularized Baseline for Beta = %s" % beta)
                self.postgres_recorder.truncateSummaryTable()
                dictregs2v = DictRegularizedSen2VecRunner(self.dbstring)
                dictregs2v.dictregBetaW = beta
                dictregs2v.dictregBetaUNW = beta
                if beta == 0.3:
                    dictregs2v.prepareData(pd)
                dictregs2v.runTheBaseline(rbase, latent_space_size)
                dictregs2v.generateSummary(gs, 11, "_neighbor_w",
                    lambda_val=self.lambda_val, diversity=diversity)
                dictregs2v.generateSummary(gs, 12, "_neighbor_unw",
                    lambda_val=self.lambda_val, diversity=diversity)
                dictregs2v.doHouseKeeping()
                self.__runSpecificEvaluation(models=[20], systems=[11, 12])
                w_recalls[beta] = self.__getRecall(method_id=11, models=[20],
                    systems=[11, 12])
                unw_recalls[beta] = self.__getRecall(method_id=12, models=[20],
                    systems=[11, 12])
                Logger.logr.info("W_Recall for %s = %s" % (beta, w_recalls[beta]))
                Logger.logr.info("UNW_Recall for %s = %s" % (beta, unw_recalls[beta]))
            w_opt_dict_reg = max(w_recalls, key=w_recalls.get)
            unw_opt_dict_reg = max(unw_recalls, key=unw_recalls.get)
            Logger.logr.info("Optimal dictregBetaW=%s and dictregBetaUNW=%s"
                % (w_opt_dict_reg, unw_opt_dict_reg))
            f.write("DCT BetaW: %.2f%s" % (w_opt_dict_reg, os.linesep))
            f.write("DCT BetaUNW: %.2f%s" % (unw_opt_dict_reg, os.linesep))
            f.write("DCT BetaW Recalls: %s%s" % (w_recalls, os.linesep))
            f.write("DCT BetaUNW Recalls: %s%s" % (unw_recalls, os.linesep))
            f.flush()

            ######## Test ########################################
            os.environ["DUC_EVAL"] = 'TEST'

            niter = 5
            for i in range(0, niter):
                f.write("###### Iteration: %s ######%s" % (i, os.linesep))
                f.write("Optimal Window: %s%s" % (window_opt, os.linesep))
                self.postgres_recorder.truncateSummaryTable()
                paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
                paraBaseline.runTheBaseline(rbase, latent_space_size,
                    window_opt)  # we need the p2v vectors created with the optimal window
                paraBaseline.generateSummary(gs,
                    lambda_val=self.lambda_val, diversity=diversity)
                paraBaseline.doHouseKeeping()
                f.flush()

                f.write("Optimal Beta: %s%s" % (beta_opt, os.linesep))
                n2vBaseline = Node2VecRunner(self.dbstring)
                n2vBaseline.mybeta = beta_opt
                generate_walk = False
                n2vBaseline.runTheBaseline(rbase, latent_space_size, generate_walk)
                n2vBaseline.generateSummary(gs, 3, "",
                    lambda_val=self.lambda_val, diversity=diversity)
                n2vBaseline.generateSummary(gs, 4, "_init",
                    lambda_val=self.lambda_val, diversity=diversity)
                n2vBaseline.generateSummary(gs, 5, "_retrofit",
                    lambda_val=self.lambda_val, diversity=diversity)
                n2vBaseline.doHouseKeeping()
                f.flush()

                f.write("Optimal alpha: %.2f%s" % (alpha_opt, os.linesep))
                iterrunner = IterativeUpdateRetrofitRunner(self.dbstring)
                iterrunner.myalpha = alpha_opt  # reinitializing myalpha
                iterrunner.runTheBaseline(rbase)
                iterrunner.generateSummary(gs, 6, "_unweighted",
                    lambda_val=self.lambda_val, diversity=diversity)
                iterrunner.generateSummary(gs, 7, "_weighted",
                    lambda_val=self.lambda_val, diversity=diversity)
                iterrunner.doHouseKeeping()

                f.write("Optimal regBetaW: %.2f%s" % (w_opt_reg, os.linesep))
                f.write("Optimal regBetaUNW: %.2f%s" % (unw_opt_reg, os.linesep))
                regs2v = RegularizedSen2VecRunner(self.dbstring)
                regs2v.regBetaW = w_opt_reg
                regs2v.regBetaUNW = unw_opt_reg
                regs2v.runTheBaseline(rbase, latent_space_size)
                regs2v.generateSummary(gs, 9, "_neighbor_w",
                    lambda_val=self.lambda_val, diversity=diversity)
                regs2v.generateSummary(gs, 10, "_neighbor_unw",
                    lambda_val=self.lambda_val, diversity=diversity)
                regs2v.doHouseKeeping()
                f.flush()

                # Labels fixed: the original wrote "regBetaW/regBetaUNW" here,
                # although the values are the dict-regularized optima.
                f.write("Optimal dictregBetaW: %.2f%s" % (w_opt_dict_reg, os.linesep))
                f.write("Optimal dictregBetaUNW: %.2f%s" % (unw_opt_dict_reg, os.linesep))
                dictregs2v = DictRegularizedSen2VecRunner(self.dbstring)
                dictregs2v.dictregBetaW = w_opt_dict_reg
                dictregs2v.dictregBetaUNW = unw_opt_dict_reg
                dictregs2v.runTheBaseline(rbase, latent_space_size)
                dictregs2v.generateSummary(gs, 11, "_neighbor_w",
                    lambda_val=self.lambda_val, diversity=diversity)
                dictregs2v.generateSummary(gs, 12, "_neighbor_unw",
                    lambda_val=self.lambda_val, diversity=diversity)
                dictregs2v.doHouseKeeping()
                f.flush()

                self.__runCombinedEvaluation()

                # 20__1_2_3_4_5_6_7_9_10_11_12_21_output_100.txt
                # 20__1_2_3_4_5_6_7_9_10_11_12_21_output_10.txt
                f.write("%s%s" % ("######################### Running for Test (100) ###########################################", os.linesep))
                file_ = os.path.join(os.environ["SUMMARYFOLDER"],
                    "20__1_2_3_4_5_6_7_9_10_11_12_21_output_100.txt")
                for line in open(file_):
                    f.write(line)
                f.flush()

                f.write("%s%s" % ("######################### Running for Test (10) ###########################################", os.linesep))
                file_ = os.path.join(os.environ["SUMMARYFOLDER"],
                    "20__1_2_3_4_5_6_7_9_10_11_12_21_output_10.txt")
                for line in open(file_):
                    f.write(line)
                f.write("%s%s" % (os.linesep, os.linesep))
                f.flush()
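# A hedged sketch (added for illustration) of the recall parsing that
# __getRecall performs on a ROUGE output file: it locates the substring
# "<method_id> ROUGE-1 Average_R: " and takes the float that follows. The
# sample line below assumes the standard ROUGE output format.
def _duc_recall_parse_sketch():
    content = "2 ROUGE-1 Average_R: 0.41234 (95%-conf.int. 0.40123 - 0.42345)"
    method_id = 2
    recall = float(content.split(
        "%s ROUGE-1 Average_R: " % method_id)[1].split(' ')[0])
    assert abs(recall - 0.41234) < 1e-9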
class ReutersReader(DocumentReader):
    """
    Reuters Document Reader.
    """

    def __init__(self, *args, **kwargs):
        """
        It reads the environment variables and initializes the base class.
        """
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["REUTERS_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['REUTERS_PATH']
        self.validationDict = {}

    def __recordDocumentTopic(self, document_id, doc):
        topic_names = []
        categories = []
        possible_categories = ["topics", "places", "people", "orgs",
                               "exchanges", "companies"]
        for category in possible_categories:
            try:
                topics = doc.find(category).findAll('d')
                for topic in topics:
                    topic = topic.text.strip()
                    topic_names += [topic]
                    categories += [category]
            except:
                pass
        self.postgres_recorder.insertIntoDocTopTable(document_id,
            topic_names, categories)

    def readTopic(self):
        topic_names = []
        categories = []
        for file_ in os.listdir(self.folderPath):
            if file_.endswith(".lc.txt"):
                category = file_.split('-')[1]
                content = open("%s%s%s" % (self.folderPath, "/", file_), 'r',
                    encoding='utf-8', errors='ignore').read()
                for topic in content.split(os.linesep):
                    topic = topic.strip()
                    if len(topic) != 0:
                        topic_names += [topic]
                        categories += [category]
        self.postgres_recorder.insertIntoTopTable(topic_names, categories)
        Logger.logr.info("Topic reading complete.")

    def _getTopic(self, document_id, doc):
        """
        A document's topic must be one of the interesting topics below; if
        multiple interesting topics are assigned to a document, the first one
        in document order wins. These are the top-10 most frequent topics from
        "Text Categorization with Support Vector Machines: Learning with Many
        Relevant Features."
        """
        interested_topic_list = ['earn', 'acq', 'money-fx', 'grain', 'crude',
            'trade', 'interest', 'ship', 'wheat', 'corn']

        topics = doc.find("topics").findAll('d')
        for topic in topics:
            topic = topic.text.strip()
            if topic in interested_topic_list:
                return topic
        return "other"

    def __createValidationSet(self, document_ids):
        total_doc = len(document_ids)
        nvalid_doc = int(total_doc * 0.20)  # np.random.choice expects an integer size
        np.random.seed(2000)
        valid_list = np.random.choice(document_ids, nvalid_doc,
            replace=False).tolist()
        for id_ in valid_list:
            self.validationDict[id_] = 1

    def __readAPass(self, load):
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        self.readTopic()

        train_doc_ids = []
        for file_ in os.listdir(self.folderPath):
            if file_.endswith(".sgm"):
                file_content = self._getTextFromFile(
                    "%s%s%s" % (self.folderPath, "/", file_))
                soup = BeautifulSoup(file_content, "html.parser")
                for doc in soup.findAll('reuters'):
                    document_id = doc['newid']
                    title = doc.find('title').text \
                        if doc.find('title') is not None else None
                    doc_content = doc.find('text').text \
                        if doc.find('text') is not None else None
                    try:
                        metadata = "OLDID:" + doc['oldid'] + "^" + "TOPICS:" + \
                            doc['topics'] + "^" + "CGISPLIT:" + doc['cgisplit'] + \
                            "^" + "LEWISSPLIT:" + doc['lewissplit']
                        if doc['lewissplit'] == "NOT-USED" or doc['topics'] == "NO" \
                                or doc['topics'] == "BYPASS":
                            Logger.logr.info("Skipping because of ModApte Split")
                            continue
                    except:
                        metadata = None
                        continue

                    topic = self._getTopic(document_id, doc)
                    if topic in ['wheat', 'corn', 'other']:
                        continue
                    # if topic not in ['ship', 'interest']:
                    #     continue

                    istrain = 'YES' if doc['lewissplit'].lower() == 'train' else 'NO'
                    if document_id in self.validationDict:
                        istrain = 'VALID'
                    if istrain == 'YES':
                        train_doc_ids.append(document_id)
                    if load == 0:
                        continue
                    self.postgres_recorder.insertIntoDocTable(document_id, title,
                        doc_content, file_, metadata)
                    self.__recordDocumentTopic(document_id, doc)
                    self._recordParagraphAndSentence(document_id, doc_content,
                        self.postgres_recorder, topic, istrain)

        # Log before returning (the original logged after the return,
        # which was unreachable).
        Logger.logr.info("[Pass %i] Document reading complete." % load)
        return train_doc_ids

    def readDocument(self, ld):
        """
        First, read and record the topics. Second, record each document, one at
        a time. Third, for each document, record the lower-level information
        (paragraphs and sentences) in their tables.
        """
        if ld <= 0:
            return 0
        train_doc_ids = self.__readAPass(0)
        self.__createValidationSet(train_doc_ids)
        self.__readAPass(1)
        return 1

    def runBaselines(self, pd, rbase, gs):
        """
        Runs the classification and clustering baselines.
        """
        optDict = self._runClassificationOnValidation(pd, rbase, gs, "reuter")
        self.doTesting(optDict, "reuter", rbase, pd, gs, True)
        optDict = self._runClusteringOnValidation(pd, rbase, gs, "reuter")
        self.doTesting(optDict, "reuter", rbase, pd, gs, False)
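# A minimal, self-contained sketch (added for illustration; the document ids
# are hypothetical) of the 20% validation split used by __createValidationSet
# above: a fixed seed makes the sample reproducible across the two passes.
def _reuters_validation_split_sketch():
    document_ids = [str(i) for i in range(1, 101)]  # 100 hypothetical train ids
    nvalid_doc = int(len(document_ids) * 0.20)      # 20 validation documents
    np.random.seed(2000)
    valid_list = np.random.choice(document_ids, nvalid_doc,
        replace=False).tolist()
    validationDict = {id_: 1 for id_ in valid_list}
    assert len(validationDict) == 20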