Exemplo n.º 1
0
	def __init__(self, *args, **kwargs):
		"""
		Set up the reader from the SENTTREE_* environment variables.

		Two variables must be set: SENTTREE_DBSTRING, which is handed to
		PostgresDataRecorder, and SENTTREE_PATH, which is kept as
		``folderPath``. A missing variable raises KeyError.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		environment = os.environ
		dbstring = environment["SENTTREE_DBSTRING"]
		self.dbstring = dbstring
		self.postgres_recorder = PostgresDataRecorder(dbstring)
		self.folderPath = environment['SENTTREE_PATH']
Exemplo n.º 2
0
	def __init__(self, *args, **kwargs):
		"""
		Set up the reader from the IMDB_* environment variables.

		Two variables must be set: IMDB_DBSTRING, which is handed to
		PostgresDataRecorder, and IMDB_PATH, which is kept as ``folderPath``.
		A missing variable raises KeyError.
		On linux or mac: export IMDB_PATH=/some_directory_containing_IMDB_data
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		environment = os.environ
		dbstring = environment["IMDB_DBSTRING"]
		self.dbstring = dbstring
		self.postgres_recorder = PostgresDataRecorder(dbstring)
		self.folderPath = environment['IMDB_PATH']
Exemplo n.º 3
0
    def __init__(self, *args, **kwargs):
        """
        Initialize the base class and read the REUTERS_* environment variables.

        REUTERS_DBSTRING is handed to PostgresDataRecorder and REUTERS_PATH is
        kept as ``folderPath``; a missing variable raises KeyError.
        ``validationDict`` starts out empty.
        """
        DocumentReader.__init__(self, *args, **kwargs)
        environment = os.environ
        dbstring = environment["REUTERS_DBSTRING"]
        self.dbstring = dbstring
        self.postgres_recorder = PostgresDataRecorder(dbstring)
        self.folderPath = environment['REUTERS_PATH']
        self.validationDict = dict()
Exemplo n.º 4
0
    def __init__(self, *args, **kwargs):
        """
        Set up the reader from the NEWSGROUP_* environment variables.

        Two variables must be set: NEWSGROUP_DBSTRING, which is handed to
        PostgresDataRecorder, and NEWSGROUP_PATH, which is kept as
        ``folderPath``. A missing variable raises KeyError.
        On linux or mac:
        export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data

        ``validationDict`` and ``topic_names`` start out empty.
        """
        DocumentReader.__init__(self, *args, **kwargs)
        environment = os.environ
        dbstring = environment["NEWSGROUP_DBSTRING"]
        self.dbstring = dbstring
        self.postgres_recorder = PostgresDataRecorder(dbstring)
        self.folderPath = environment['NEWSGROUP_PATH']
        self.validationDict = dict()
        self.topic_names = list()
Exemplo n.º 5
0
	def __init__(self, *args, **kwargs):
		"""
		Initialize the base class and read the DUC_* environment variables.

		DUC_DBSTRING is handed to PostgresDataRecorder and DUC_PATH is kept as
		``folderPath``. DUC_LAMBDA, DUC_DIVERSITY and DUC_TOPIC are stored
		exactly as read from the environment (i.e. as strings) in
		``lambda_val``, ``diversity`` and ``duc_topic``. A missing variable
		raises KeyError.

		The processed-file bookkeeping lists start empty and ``document_id``
		starts at 0.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		environment = os.environ
		dbstring = environment["DUC_DBSTRING"]
		self.dbstring = dbstring
		self.postgres_recorder = PostgresDataRecorder(dbstring)
		self.folderPath = environment['DUC_PATH']
		self.processed_filenames, self.processed_summaries = [], []
		self.lambda_val = environment['DUC_LAMBDA']
		self.diversity = environment['DUC_DIVERSITY']
		self.duc_topic = environment['DUC_TOPIC']
		self.document_id = 0
Exemplo n.º 6
0
    def __readAPass(self, load=0):
        """
        Make one pass over the corpus folder and collect training document ids.

        Every non-hidden entry of ``self.folderPath`` is visited. The text
        after the last ``-`` in its name is the split tag; a tag equal to
        ``train`` (case-insensitive) marks training data. Inside each entry
        the files of the selected topics are read, cleaned with
        ``stripDocContent`` and numbered consecutively starting at 1.

        load -- if 0, ``self.topic_names`` is refreshed from ``readTopic()``
                before the pass; if 1, every document, its topic/category and
                its paragraphs and sentences are written through
                ``self.postgres_recorder``.

        Returns the list of document ids that belong to the training split.
        Ids present in ``self.validationDict`` are still returned, but are
        recorded with the tag 'VALID' instead of 'YES'.
        """
        if load == 0:
            self.topic_names = self.readTopic()

        # Built once per pass: only these topics are read, all others skipped.
        selected_topics = frozenset([
            'talk.politics.mideast', 'comp.graphics',
            'soc.religion.christian', 'rec.autos', 'sci.space',
            'talk.politics.guns', 'rec.sport.baseball', 'sci.med'
        ])

        train_doc_ids = []
        document_id = 0
        title = None
        for first_level_folder in os.listdir(self.folderPath):
            if DocumentReader._folderISHidden(self, first_level_folder):
                continue

            # The split tag depends only on the folder name, so it is derived
            # once per folder. str.split always returns at least one element,
            # so no exception handling is required here.
            trainortest = first_level_folder.split('-')[-1]
            metadata = "SPLIT:%s" % trainortest
            split_tag = 'YES' if trainortest.lower() == 'train' else 'NO'

            for topic in self.topic_names:
                if topic not in selected_topics:
                    continue

                topic_dir = os.path.join(self.folderPath, first_level_folder,
                                         topic)
                category = topic.split('.')[0]
                for file_ in os.listdir(topic_dir):
                    doc_content = self._getTextFromFile(
                        os.path.join(topic_dir, file_))
                    doc_content = self.stripDocContent(doc_content)

                    document_id += 1
                    if split_tag == 'YES':
                        train_doc_ids.append(document_id)

                    # Validation documents keep their place in train_doc_ids
                    # but are recorded under a separate tag.
                    istrain = split_tag
                    if document_id in self.validationDict:
                        istrain = 'VALID'

                    if load == 1:
                        self.postgres_recorder.insertIntoDocTable(
                            document_id, title, doc_content, file_, metadata)
                        self.postgres_recorder.insertIntoDocTopTable(
                            document_id, [topic], [category])
                        self._recordParagraphAndSentence(
                            document_id, doc_content,
                            self.postgres_recorder, topic, istrain)

        Logger.logr.info("A pass of the document reading complete.")
        return train_doc_ids
Exemplo n.º 7
0
    def readDocument(self, ld):
        """
        Load the corpus into the database, storing one document per line.

        ld -- load flag; when ``ld <= 0`` nothing is done and 0 is returned.

        Otherwise ``postgres_recorder.trucateTables()`` and
        ``alterSequences()`` are called, the topics are obtained from
        ``readTopic()``, and every non-hidden sub-directory of
        ``self.folderPath`` is processed as a split folder (a name equal to
        ``train``, case-insensitive, marks training data). Each file under
        ``<split>/<topic>`` is split on ``os.linesep`` and every piece is
        recorded as its own document, with ids assigned consecutively
        starting at 1.

        Returns 1 once loading has finished.
        """
        if ld <= 0:
            return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        topic_names = self.readTopic()

        document_id = 0
        title = None
        # next(os.walk(...))[1] yields only the immediate sub-directories.
        for first_level_folder in next(os.walk(self.folderPath))[1]:
            if DocumentReader._folderISHidden(self, first_level_folder):
                continue

            # Split information depends only on the folder name, so compute it
            # once per folder; none of these string operations can raise.
            metadata = "SPLIT:%s" % first_level_folder
            istrain = 'YES' if first_level_folder.lower() == 'train' else 'NO'

            for topic in topic_names:
                topic_dir = os.path.join(self.folderPath, first_level_folder,
                                         topic)
                category = topic.split('.')[0]
                for file_ in os.listdir(topic_dir):
                    file_content = self._getTextFromFile(
                        os.path.join(topic_dir, file_))

                    # NOTE(review): a trailing line separator produces a final
                    # empty piece that is also stored as a document - confirm
                    # this is intended.
                    for doc_content in file_content.split(os.linesep):
                        document_id += 1
                        self.postgres_recorder.insertIntoDocTable(
                            document_id, title, doc_content, file_, metadata)
                        self.postgres_recorder.insertIntoDocTopTable(
                            document_id, [topic], [category])
                        self._recordParagraphAndSentence(
                            document_id, doc_content,
                            self.postgres_recorder, topic, istrain)

        Logger.logr.info("Document reading complete.")
        return 1