Exemplo n.º 1
0
def CreateSupplementalSentenceStructures(supp_file_path):
	"""
	Create SentenceStructures from supplemental documents

	Every entry of the directory is read once. Each line of a document (as returned
	by ``readlines``, trailing newline included) becomes one SentenceStructure built
	from that line and the document name; its ``modifiedSentence`` attribute is set
	to the line at the same index of the ``preprocess``-ed document text.

	The process working directory is switched to ``supp_file_path`` while the
	documents are read and is always restored afterwards, even when an error occurs.

	:param supp_file_path: Path to directory where supplemental documents are located
	:return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
	:raises IndexError: If the preprocessed text of a document has fewer lines than the document itself
	"""

	#Create a dictionary of documents
	docDictionary = {}

	# cd into the supplemental file directory; the finally clause guarantees we cd back
	cwd = os.getcwd()
	os.chdir(supp_file_path)
	try:
		#Iterate over documents in the supp_file_path directory
		for document in os.listdir():

			#Read the document once; the context manager closes it even on error
			with open(document, "r") as doc:
				rawSentences = doc.readlines()

			#Joining the lines reproduces exactly what doc.read() would have returned
			docTextProcessedSplit = preprocess("".join(rawSentences)).splitlines()

			#Every raw line needs a preprocessed counterpart; fail with context instead of a bare index error
			if len(docTextProcessedSplit) < len(rawSentences):
				raise IndexError(
					"Preprocessed text of {} has {} lines but the document has {}".format(
						document, len(docTextProcessedSplit), len(rawSentences)))

			#Strip the extension from the file to get the document name
			docName = os.path.splitext(document)[0]

			#Build a SentenceStructure for each sentence(line) in the document.
			#zip stops at the last raw line, so surplus preprocessed lines are ignored.
			docSentenceStructureList = []
			for sentence, modifiedSentence in zip(rawSentences, docTextProcessedSplit):
				ss = SentenceStructure(sentence, docName)
				ss.modifiedSentence = modifiedSentence
				docSentenceStructureList.append(ss)

			#Add the SentenceStructureList to the dictionary
			docDictionary[docName] = docSentenceStructureList
	finally:
		#Return to original path
		os.chdir(cwd)

	#Return the dictionary
	return docDictionary
Exemplo n.º 2
0
def CreateSentenceStructures(raw_file_path):
	"""
	Create SentenceStructures from raw documents

	Every entry of the directory is read once. Each line of a document (as returned
	by ``readlines``, trailing newline included) becomes one SentenceStructure; its
	``modifiedSentence`` attribute is set to the line at the same index of the
	``preprocess``-ed document text.

	The process working directory is switched to ``raw_file_path`` while the
	documents are read and is always restored afterwards, even when an error occurs.

	:param raw_file_path: Path to directory where raw documents are located
	:return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
	:raises SystemExit: With a non-zero status if building the SentenceStructures of a document fails
		(including the case where the preprocessed text has fewer lines than the document)
	:raises AssertionError: If the preprocessed text has more lines than the document
	"""

	#Create a dictionary of documents
	docDictionary = {}

	# cd into the raw file directory; the finally clause guarantees we cd back
	cwd = os.getcwd()
	os.chdir(raw_file_path)
	try:
		#Iterate over documents in the raw_file_path directory
		for document in os.listdir():

			#Read the document once; the context manager closes it even on error
			with open(document, "r") as doc:
				rawSentences = doc.readlines()

			#Joining the lines reproduces exactly what doc.read() would have returned
			docTextProcessedSplit = preprocess("".join(rawSentences)).splitlines()

			#Instantiate a list to hold a SentenceStructure for each sentence(line) in the document
			docSentenceStructureList = []
			try:
				for counter, sentence in enumerate(rawSentences):
					#Create a SentenceStructure obj
					ss = SentenceStructure(sentence)
					ss.modifiedSentence = docTextProcessedSplit[counter]

					#Add SentenceStructure obj to the list
					docSentenceStructureList.append(ss)
			except Exception as err:
				#Only real errors are handled here (Ctrl-C is no longer swallowed); report
				#the cause and exit with a failure status so callers/scripts can detect it.
				print("ERR. " + str(document) + " (" + repr(err) + ")")
				sys.exit(1)

			#Explicit raise instead of an assert statement so the check survives python -O
			if len(docSentenceStructureList) != len(docTextProcessedSplit):
				raise AssertionError("Assertion Failed, array lengths don't match. " + str(len(docSentenceStructureList)) + " " + str(len(docTextProcessedSplit)))

			#Strip the extension from the file to get the document name
			docName = os.path.splitext(document)[0]

			#Add the SentenceStructureList to the dictionary
			docDictionary[docName] = docSentenceStructureList
	finally:
		#Return to original path
		os.chdir(cwd)

	#Return the dictionary
	return docDictionary