class DataProcessorThread(Thread):
    """Worker thread that consumes work items (single articles or article
    batches) from a task queue, analyses them (keyword search, citation
    detection, NER tagging via the external SENNA tool) and pushes CSV-ready
    row lists onto a result queue.

    The actual work is selected by assigning one of the ``process*`` /
    ``export*`` bound methods to ``self._executeFunction`` before ``start()``
    is called; ``run()`` simply invokes it.  It is initialised to ``None``,
    so starting the thread without assigning it raises ``TypeError``.

    NOTE(review): this is Python 2 code (``unicode`` builtin, csv files
    opened ``'wb'``, list-returning ``filter``) — confirm the target
    interpreter before modernising.
    """

    def __init__(self, taskQueue, resultQueue, *args):
        # taskQueue  : queue of work items, terminated by the END_OF_QUEUE sentinel.
        # resultQueue: queue receiving finished CSV row lists.
        # *args      : extra parameters for the selected work function
        #              (processKeywordSearch reads the search string from args[0]).
        super(DataProcessorThread, self).__init__()
        self._taskQueue = taskQueue
        self._resultQueue = resultQueue
        self._args = args
        self._executeFunction = None  # must be assigned externally before start()
        self._db = DBController()
        # citation word list (presumably citation verbs such as "said") —
        # verify against the WORD_CITE resource
        self._citeWordList = getWordList(WORD_CITE)
        # the export* methods write into export/ — make sure it exists up front
        if not os.path.exists('export/'):
            os.makedirs('export/')

    def exportSentenceAnalysis(self):
        """Dump every sentence in the DB, with its per-sentence analysis
        fields, to export/sentence.csv (one row per sentence).

        Deprecated per the original author: should be refactored to use the
        task/result queues like the process* methods.
        """
        #sentence collection is all the sentence
        #deprecated, need to refactor and apply queue
        with open('export/sentence.csv', 'wb') as f:
            writer = csv.writer(f)
            sentences = self._db.getAllSentence()
            # cache articles by id so each article is fetched from the DB only once
            articleDict = {}
            attributeList = ['id', 'cotic', 'coname', 'filePath', 'accessionNo', 'content', 'coname','ceoname', 'cite', 'co_c', 'ceo_c', 'analyst_c', 'pfm', 'pfm_words', 'pos', 'pos_words', 'neg', 'neg_words', 'internal', 'int_words', 'external', 'ext_words', 'quote_sen', 'analyst']
            writer.writerow(attributeList)
            for i, sentence in enumerate(sentences):
                try:
                    print(i)  # progress indicator
                    if sentence['articleId'] not in articleDict:
                        articleDict[sentence['articleId']] = self._db.getArticleById(sentence['articleId'])
                    article = articleDict[sentence['articleId']]
                    # the company code is encoded in the article's file path; an 'a'
                    # in the second-to-last segment shifts the code one level up
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    # fall back to the raw code when the company is not in the DB
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    sentenceCompanyList = [self._db.getCompanyById(companyId) for companyId in sentence['company']]
                    sentenceCompanyNameString = ','.join([company['shortName'] for company in sentenceCompanyList])
                    sentenceEngagerList = [self._db.getEngagerById(engagerId) for engagerId in sentence['engager']]
                    CEOList = filter(lambda engager : engager['type'] == ENGAGER_CEO, sentenceEngagerList)
                    # NOTE(review): analystList is computed but never used below
                    analystList = filter(lambda engager : engager['type'] == ENGAGER_ANALYST, sentenceEngagerList)
                    CEONameString = ','.join([CEO['lastName'] for CEO in CEOList])
                    citeWordString = ','.join(sentence['cite'])
                    citeCompany, citeCEO, citeAnalyst = int(sentence['citeCompany']), int(sentence['citeCEO']), int(sentence['citeAnalyst'])
                    pfmWordString = ','.join(sentence['pfm'])
                    posWordString = ','.join(sentence['pos'])
                    negWordString = ','.join(sentence['neg'])
                    inWordString = ','.join(sentence['in'])
                    exWordString = ','.join(sentence['ex'])
                    quoteString = getQuotedString(sentence['content'])
                    analystSurroundString = getStringSurroundWordInDistance(sentence['content'], 'analyst', ANALYST_SURROUND_DISTANCE)
                    # column order must match attributeList above
                    lineList = [sentence['_id'], articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], sentence['content'].encode('utf-8'), sentenceCompanyNameString, CEONameString, citeWordString, citeCompany, citeCEO, citeAnalyst, len(sentence['pfm']), pfmWordString, len(sentence['pos']), posWordString, len(sentence['neg']), negWordString, len(sentence['in']), inWordString, len(sentence['ex']), exWordString, quoteString, analystSurroundString]
                    writer.writerow(lineList)
                except Exception as e:
                    # best-effort export: report the failing row and keep going
                    print(e)

    def exportArticleAnalysis(self):
        """Dump every article's metadata (plus up to ARTICLE_EXPORT_CODE_SIZE
        company codes and subject codes) to export/article.csv.

        Also backfills missing 'company' / 'newsSubject' fields on the
        article document and saves it back to the DB.  Deprecated per the
        original author.
        """
        #deprecated
        with open('export/article.csv', 'wb') as f:
            writer = csv.writer(f)
            articleList = list(self._db.getAllArticle())
            attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'coname1', 'coname2', 'coname3', 'coname4', 'coname5', 'subjectCode1', 'subjectCode2', 'subjectCode3', 'subjectCode4', 'subjectCode5']
            writer.writerow(attributeList)
            for i, article in enumerate(articleList):
                try:
                    print(i)  # progress indicator
                    # derive the company code from the file path (see
                    # exportSentenceAnalysis for the same convention)
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    # fixed-width code slots so every row has the same shape
                    companyCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    subjectCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    if 'company' in article:
                        # copy at most ARTICLE_EXPORT_CODE_SIZE codes into the slots
                        for i, companyCode in enumerate(article['company']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            companyCodeList[i] = companyCode
                    else:
                        # backfill: derive the company list from the file path
                        article['company'] = [articleCompanyCode]
                        # NOTE(review): this rebinds companyCodeList to a 1-element
                        # list, so the row width differs from the fixed-width header
                        # in this branch — confirm whether that is intended
                        companyCodeList = article['company']
                    if 'newsSubject' in article:
                        for i, subjectCode in enumerate(article['newsSubject']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            subjectCodeList[i] = subjectCode
                    else:
                        # backfill an empty subject list
                        article['newsSubject'] = []
                        # NOTE(review): same row-width caveat as above (empty list)
                        subjectCodeList = article['newsSubject']
                    # persist any backfilled fields
                    self._db.saveArticle(article)
                    lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'], article['byline']] + companyCodeList + subjectCodeList
                    writer.writerow(lineList)
                except Exception as e:
                    # best-effort export: report the failing row and keep going
                    print(e)

    def processKeywordSearch(self):
        """Queue worker: for each article pulled from the task queue, find
        sentences (falling back to whole paragraphs) matching the keyword
        search string given in self._args[0], and put one CSV row per
        article onto the result queue.  Stops at the END_OF_QUEUE sentinel.

        NOTE(review): unlike processCitationBlock this worker never calls
        task_done() — verify the task queue is not join()ed for this mode.
        """
        searchString = self._args[0]
        while True:
            article = self._taskQueue.get()
            if article == END_OF_QUEUE:
                break
            else:
                articlePathPartList = article['filePath'].split('/')
                articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                articleSentenceList = []
                #here, use '|' to combine regex is OK, because sentence is short, will not reduce the performance that much.
                #But in DB search, use iterative way.
                pattern = getPatternByKeywordSearchString(searchString)
                #on sentence level first, if can't find, go to paragraph level.
                for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                    sentenceList = sent_tokenize(paragraph)
                    for sentence in sentenceList:
                        if re.search(pattern, sentence) is not None:
                            articleSentenceList.append(sentence.encode('utf-8').strip())
                if not articleSentenceList:
                    #search on paragraph level
                    for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                        if re.search(pattern, paragraph) is not None:
                            articleSentenceList.append(paragraph.encode('utf-8').strip())
                # matched snippets are tab-joined into a single CSV cell
                lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['headline'].strip(), '\t'.join(articleSentenceList)]
                self._resultQueue.put(lineList)

    def processCitationBlock(self):
        """Queue worker: pull article BATCHES from the task queue, detect
        quoted/cited sentences, enrich them with SENNA NER results
        (persons, organisations, CEO match, brokerage mention) and dictionary
        word matches, and push one row list per sentence to the result queue.

        Row layout (indexes referenced by the constants below):
          0-8  : article part (code, name, path, id, date, source, byline,
                 byline_cleaned, headline)
          9    : sentence text            (sentenceTextIndex)
          10-11: quoted parts, cite words
          12-16: NER part                 (NERStartIndex, NERPartCount = 5)
          17   : sentence word count
          18+  : word-match counts then word-match strings
                 (wordMatchStartIndex, one pair of slots per pattern)
        """
        #because list is too long, we need to separate name in to chunk
        # (a single giant alternation regex would be unwieldy, so brokerage
        #  names are compiled in chunks of 500)
        brokerNameList = list(self._db.getAllBrokerageEffectiveNameList())
        brokerageNamePatternList = []
        for i in range(0, len(brokerNameList), 500):
            brokerageNamePatternList.append(re.compile(r'|'.join([r'\b' + name + r'\b' for name in brokerNameList[i : i + 500]]), re.IGNORECASE))
        quotePattern = re.compile(r'\"[^\"]+\"')
        citeWordPatternStringList = [(r'\b' + citeWord + r'\b') for citeWord in self._citeWordList]
        companyCEODict = self._db.getAllCompanyCEODict()
        engagerNamePattern = re.compile(r'|'.join(['CEO', 'analyst', 'executive']), re.IGNORECASE)
        citeWordPattern = re.compile(r'|'.join(citeWordPatternStringList), re.IGNORECASE)
        wordMatchPatternList = [getWordRegexPattern(WORD_CAUSE_IN), getWordRegexPattern(WORD_CAUSE_EX), getWordRegexPattern(WORD_CONTROL_LOW), getWordRegexPattern(WORD_CONTROL_HIGH), getWordRegexPattern(MCD_POS), getWordRegexPattern(MCD_NEG), getWordRegexPattern(MCD_UNCERTAIN)]
        filterWordDict = getWordDict(WORD_FILTER)
        while True:
            #process in batch
            articleBatch = self._taskQueue.get()
            if articleBatch == END_OF_QUEUE:
                self._taskQueue.task_done()
                break
            else:
                lineListBatch = []
                toProcessSentenceBatch = []
                # column indexes into each row — see the docstring layout above
                sentenceTextIndex, NERStartIndex, NERPartCount, wordMatchStartIndex = 9, 12, 5, 18
                #add byline_cleaned in articleDict
                self.processBylineInBatch(articleBatch)
                for article in articleBatch:
                    self._db.setArticleProcessed(article['_id'])
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    articleLineListPart = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['byline_cleaned'], article['headline'].strip()]
                    for paragraph in [article['leadParagraph'], article['tailParagraph']]:
                        #if found quoted part in this paragraph
                        quotedStringList = re.findall(quotePattern, paragraph)
                        if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5:
                            #Among all the quoted parts, the max word count MUST be bigger than 5.
                            #If so, then get all sentences
                            sentenceList = sent_tokenize(paragraph)
                            for sentence in sentenceList:
                                quotedStringList = re.findall(quotePattern, sentence)
                                citeWordList = re.findall(citeWordPattern, sentence)
                                #If this sentence has quotation and quoted part word count is bigger than 5 and has cite word,
                                #then parse it and add to the export
                                if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5 and citeWordList:
                                    # NER and word-match slots are pre-filled with ''
                                    # and filled in below / after the NER batch pass
                                    lineList = articleLineListPart + [sentence, '. '.join(quotedStringList), ', '.join(citeWordList)] + [''] * NERPartCount + [len(sentence.split())] + [''] * len(wordMatchPatternList) * 2
                                    # Match the keywords in each dictionary pattern
                                    for i, pattern in enumerate(wordMatchPatternList):
                                        matchedWordList = getMatchWordListFromPattern(sentence, pattern, filterWordDict)
                                        lineList[i + wordMatchStartIndex] = len(matchedWordList)
                                        lineList[i + len(wordMatchPatternList) + wordMatchStartIndex] = ', '.join(matchedWordList)
                                    lineListBatch.append(lineList)
                                    toProcessSentenceBatch.append(sentence)
                # one SENNA invocation for the whole batch of cited sentences
                actorAndOrgListBatch = self.processCiteSentenceInBatch(toProcessSentenceBatch)
                for i, actorAndOrgList in enumerate(actorAndOrgListBatch):
                    # None means no person/org found outside quotes; such rows
                    # are skipped (never put on the result queue)
                    if actorAndOrgList is not None:
                        engagerNameList = re.findall(engagerNamePattern, lineListBatch[i][sentenceTextIndex])
                        # FCEO = 1 when any name part of a detected person matches
                        # the focal company's CEO name
                        FCEO = 0
                        articleCompanyCode = lineListBatch[i][0]
                        for name in actorAndOrgList[0].split(', '):
                            for namePart in name.split():
                                if articleCompanyCode in companyCEODict and companyCEODict[articleCompanyCode].find(namePart) != -1:
                                    FCEO = 1
                        lineListBatch[i][NERStartIndex] = actorAndOrgList[0]
                        lineListBatch[i][NERStartIndex + 1] = actorAndOrgList[1]
                        lineListBatch[i][NERStartIndex + 2] = ' '.join(engagerNameList)
                        lineListBatch[i][NERStartIndex + 3] = FCEO
                        # brokerage names are searched only outside quoted spans
                        unQuotedPart = re.sub(r'"[^"]+"', '', lineListBatch[i][sentenceTextIndex])
                        findBrokerage = False
                        for pattern in brokerageNamePatternList:
                            result = pattern.search(unQuotedPart)
                            # patterns are IGNORECASE, so additionally require the
                            # matched text to start with an uppercase letter
                            if result is not None and result.string[result.regs[0][0]].isupper():
                                findBrokerage = True
                                break
                        lineListBatch[i][NERStartIndex + 4] = 1 if findBrokerage else 0
                        self._resultQueue.put(lineListBatch[i])
                self._taskQueue.task_done()

    def getNERTaggedTupleListFromSentence(self, sentence):
        """Run the external SENNA tagger on *sentence* and return a list of
        [token, tag] pairs, one per token.

        The tag column is reduced to its suffix after '-' (e.g. 'B-PER' ->
        'PER' — presumably BIO-style tags; confirm against SENNA's output
        format).  NOTE(review): relies on a temp/ directory and the
        ./senna/ binary existing relative to the working directory, and
        shells out via os.system with no error checking.
        """
        #use senna name entity tagger, it fast!!
        sentence = unicode(sentence).encode('utf-8', 'ignore')
        with open('temp/input.txt', 'w') as f:
            f.write(sentence)
        os.system('./senna/senna -path senna/ -ner <temp/input.txt> temp/output.txt')
        with open('temp/output.txt', 'r') as f:
            # one output line per token; column 1 is the NER tag
            tagTupleList = [[word.strip().split('-')[-1] if i ==1 else word.strip() for i, word in enumerate(line.split())] for line in f.readlines() if line.split()]
        return tagTupleList

    def processBylineInBatch(self, articleBatch):
        """NER-tag all bylines of *articleBatch* in one SENNA call and store
        the comma-joined person names on each article as 'byline_cleaned'.

        Bylines are joined with the ' ****** ' separator so the per-article
        boundaries can be recovered from the tagger output; empty bylines
        are replaced with 'null.' so they still produce a segment.
        """
        #use 'null.' to replace '' bylines, because if the last byline is '', it would not be added to the concatenated string.
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join([article['byline'] if article['byline'] else 'null.' for article in articleBatch]))
        personList, lastTag, wordList = [], '', []
        articleIndex = 0
        for i in range(len(tagTupleList)):
            # group consecutive tokens sharing a tag; flush a person name
            # whenever a PER run ends
            if tagTupleList[i][1] != lastTag:
                if lastTag == 'PER':
                    personList.append(' '.join(wordList))
                wordList = [tagTupleList[i][0]]
                lastTag = tagTupleList[i][1]
            else:
                wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one article's byline segment
                articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList) if personList else ''
                personList, lastTag, wordList = [], '', []
                # do not advance past the last article on the final token
                articleIndex = articleIndex + 1 if i != len(tagTupleList) - 1 else articleIndex
        if articleIndex >= len(articleBatch):
            return
        # any articles whose segment was lost in the tagger output still get
        # an (empty) byline_cleaned so later code can rely on the key
        while articleIndex < len(articleBatch):
            articleBatch[articleIndex]['byline_cleaned'] = ''
            articleIndex += 1

    def processCiteSentenceInBatch(self, sentenceBatch):
        """NER-tag all sentences of *sentenceBatch* in one SENNA call and
        return, per sentence, either None (nothing found) or a 2-element
        list [person names, organisation names] — comma-joined strings of
        entities found OUTSIDE quoted spans.

        Sentences are joined with ' ****** ' so boundaries survive the
        round trip through the tagger (same trick as processBylineInBatch).
        """
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join(sentenceBatch))
        personAndOrgListBatch = []
        personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        for i in range(len(tagTupleList)):
            if tagTupleList[i][0] == '\"':
                # toggle quote state; discard any entity run that started
                # inside the quote
                inQuoteFlag = 1 - inQuoteFlag
                if not inQuoteFlag:
                    del wordList[:]
            else:
                if not inQuoteFlag:
                    # group consecutive same-tag tokens; flush PER/ORG runs
                    if tagTupleList[i][1] != lastTag:
                        if lastTag == 'PER':
                            personList.append(' '.join(wordList))
                        elif lastTag == 'ORG':
                            orgnizationList.append(' '.join(wordList))
                        wordList = [tagTupleList[i][0]]
                        lastTag = tagTupleList[i][1]
                    else:
                        wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one sentence segment
                if not personList and not orgnizationList:
                    personAndOrgListBatch.append(None)
                else:
                    personAndOrgListBatch.append([', '.join(personList), ', '.join(orgnizationList)])
                personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        return personAndOrgListBatch

    def run(self):
        # Thread entry point: delegate to whichever work function the caller
        # assigned to self._executeFunction (raises TypeError if still None).
        self._executeFunction()
class SignifierParser(object):
    """Extracts sentences from stored articles and annotates them with
    'signifier' word lists — performance (pfm), positive/negative tone,
    internal/external attribution (atrb) and citation verbs — persisting
    every result through DBController.

    NOTE(review): Python 2 code — the ``filter(...)`` results stored into
    sentence documents are lists under Python 2 but would be lazy filter
    objects under Python 3; confirm the interpreter before porting.
    """

    def __init__(self):
        # Word lists for each signifier category, loaded once.
        self.db = DBController()
        self.pfmWord = getWordList(WORD_PFM)
        self.posWord = getWordList(WORD_POS)
        self.negWord = getWordList(WORD_NEG)
        self.exWord = getWordList(ATRB_EX)
        self.inWord = getWordList(ATRB_IN)
        self.citeWord = getWordList(WORD_CITE)
        # All engagers/companies are materialised up front so the compiled
        # regex dicts below cover every entity.
        self.engagerList = list(self.db.getAllEngager())
        self.companyList = list(self.db.getAllCompany())
        self.engagerRegexPatternDict, self.companyRegexPatternDict = self.getRegexPatternDictForEngagerAndCompany()

    def getRegexPatternDictForEngagerAndCompany(self):
        """Compile one whole-word regex per engager and per company, keyed by
        '_id', and return the two dicts (engagerDict, companyDict).

        Engagers are matched by last name, except for a handful of surnames
        (Jones, Johnson, West, Post, Ford) where the full name is used —
        presumably because those surnames are too common/ambiguous on their
        own.  Company patterns are case-insensitive; engager patterns are
        case-sensitive.
        """
        engagerRegexPatternDict, companyRegexPatternDict = {}, {}
        for engager in self.engagerList:
            if engager['lastName'] == 'Jones' or engager['lastName'] == 'Johnson' or engager['lastName'] == 'West' or engager['lastName'] == 'Post' or engager['lastName'] == 'Ford':
                searchName = engager['name']
            else:
                searchName = engager['lastName']
            engagerRegexPatternDict[engager['_id']] = re.compile(r'\b' + searchName + r'\b')
        for company in self.companyList:
            companyRegexPatternDict[company['_id']] = re.compile(r'\b' + company['shortName'] + r'\b', re.IGNORECASE)
        return engagerRegexPatternDict, companyRegexPatternDict

    def extractAllSentenceToDB(self, isReload=False):
        """Tokenise every article's lead/tail paragraphs into sentences and
        insert each valid sentence that mentions an engager or company into
        the sentence collection.

        isReload: when True, drop the existing sentence collection first.
        """
        if isReload:
            self.db.dropSentence()
        # for company in self.companies:
        for i, company in enumerate(self.companyList):
            articles = list(self.db.getAllArticleByCompanyCode(company['code']))
            # only this company's engagers are matched against its articles
            engagers = list(self.db.getAllEngagerByCompanyId(company['_id']))
            for j, article in enumerate(articles):
                print(i, j)  # progress indicator (company, article)
                paragraphSet = ('leadParagraph', 'tailParagraph')
                for key in paragraphSet:
                    paragraph = article[key]
                    sentenceList = sent_tokenize(paragraph)
                    for string in sentenceList:
                        if not isValidSentence(string):
                            continue
                        sentenceDict = {'content' : string.encode('utf-8'), 'articleId' : article['_id'], 'paragraph' : key}
                        # parseRawSentence returns None when no entity matches;
                        # such sentences are not stored
                        sentenceDict = self.parseRawSentence(sentenceDict, engagers)
                        if sentenceDict is not None:
                            self.db.insertSentence(sentenceDict)

    def parseRawSentence(self, sentenceDict, engagerList):
        """Attach de-duplicated 'engager' and 'company' id lists to
        *sentenceDict* based on regex matches against its content.

        Returns the enriched dict, or None when the sentence mentions no
        engager from *engagerList* and no company at all (i.e. not worth
        storing).
        """
        engagerIdList, companyIdList = [], []
        for engager in engagerList:
            if self.engagerRegexPatternDict[engager['_id']].search(sentenceDict['content']) is not None:
                engagerIdList.append(engager['_id'])
        # companies are matched against the full company list, not just the
        # focal company's
        for company in self.companyList:
            if self.companyRegexPatternDict[company['_id']].search(sentenceDict['content']) is not None:
                companyIdList.append(company['_id'])
        if not engagerIdList and not companyIdList:
            return None
        else:
            sentenceDict['engager'] = list(set(engagerIdList))
            sentenceDict['company'] = list(set(companyIdList))
            return sentenceDict

    def parseAllSentenceCitation(self):
        """For every stored sentence: extract its citation verbs ('cite')
        and the three proximity flags (citeCEO, citeAnalyst, citeCompany),
        then save the sentence back to the DB.
        """
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            words = getProcessedWordList(sentence['content'], VERB)
            sentence['cite'] = filter(lambda word : word in self.citeWord, words)
            sentence['citeCEO'], sentence['citeAnalyst'], sentence['citeCompany'] = self.isCiteInDistance(sentence)
            self.db.saveSentence(sentence)

    def parseAllSentencePfm(self):
        """For every stored sentence: find performance nouns and the
        positive/negative verbs occurring within PFM_DISTANCE of them, then
        persist the three word lists via updateSentencePfm.
        """
        #list them all, because if we loop with a cursor and update the cursor-pointed sentence at the same time, the cursor gets screwed up.
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            # pfm words are matched on the noun-processed token list,
            # pos/neg words on the verb-processed one
            pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
            pfmWordList = filter(lambda word : word in self.pfmWord, pfmSentenceWordList)
            posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
            posWordList = filter(lambda word : word in self.posWord, posNegSentenceWordList)
            negWordList = filter(lambda word : word in self.negWord, posNegSentenceWordList)
            # keep only pos/neg words close enough to some pfm word
            posWordList, negWordList = self.filterPosNegWordListByDistance(pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList)
            self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)

    def parseAllSentenceAtrb(self):
        """For every stored sentence: extract external/internal attribution
        nouns and persist them via updateSentenceAtrb.

        Internal-attribution words are discarded when the sentence is a
        citation and the internal words are only 'ceo'/'executive' mentions
        (i.e. the speaker reference, not an attribution).
        """
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            words = getProcessedWordList(sentence['content'], NOUN)
            exWordList = filter(lambda word : word in self.exWord, words)
            inWordList = filter(lambda word : word in self.inWord, words)
            if ('ceo' in inWordList or 'executive' in inWordList) and sentence['cite']:
                inWordList = []
            self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)

    def isCiteInDistance(self, sentence):
        """Return (isCiteCEO, isCiteAnalyst, isCiteCompany): whether a
        citation verb occurs within CITE_DISTANCE words of an engager name /
        a company short name in the sentence's processed word list.

        The bare except/pass absorbs the ValueError raised by .index() when
        a name does not occur in the word list — that is the expected
        "not cited" path.  NOTE(review): it also silently hides DB-lookup
        errors; only the FIRST occurrence of each word is considered.
        """
        #if (CEO or Company) and citation word happen within CITE_DISTANCE words, capture it
        isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
        if sentence['cite']:
            wordList = getProcessedWordList(sentence['content'], VERB)
            for citeWord in sentence['cite']:
                citeIndex = wordList.index(citeWord)
                for engagerId in sentence['engager']:
                    try:
                        engager = self.db.getEngagerById(engagerId)
                        matchName = engager['lastName'].lower()
                        engagerIndex = wordList.index(matchName)
                        if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
                            # non-CEO engagers are treated as analysts
                            if engager['type'] == ENGAGER_CEO:
                                isCiteCEO = True
                            else:
                                isCiteAnalyst = True
                    except:
                        pass
                for companyId in sentence['company']:
                    try:
                        company = self.db.getCompanyById(companyId)
                        matchName = company['shortName'].lower()
                        companyIndex = wordList.index(matchName)
                        if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
                            isCiteCompany = True
                    except:
                        pass
        return isCiteCEO, isCiteAnalyst, isCiteCompany

    def filterPosNegWordListByDistance(self, pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList):
        """Return (filteredPos, filteredNeg): the pos/neg words whose first
        occurrence lies within PFM_DISTANCE tokens of some pfm word's first
        occurrence.

        Positions are taken from the respective processed word lists
        (pfm words indexed in *pfmSentenceWordList*, pos/neg words in
        *posNegSentenceWordList*).  NOTE(review): a word near several pfm
        words is appended once per nearby pfm word (duplicates possible).
        """
        filteredPosWordList, filteredNegWordList = [],[]
        for pfmWord in pfmWordList:
            pfmIndex = pfmSentenceWordList.index(pfmWord)
            for posWord in posWordList:
                posIndex = posNegSentenceWordList.index(posWord)
                if abs(pfmIndex - posIndex) <= PFM_DISTANCE:
                    filteredPosWordList.append(posWord)
            for negWord in negWordList:
                negIndex = posNegSentenceWordList.index(negWord)
                if abs(pfmIndex - negIndex) <= PFM_DISTANCE:
                    filteredNegWordList.append(negWord)
        return filteredPosWordList, filteredNegWordList