class DataProcessorThread(Thread):
    """Worker thread that consumes work items (single articles or article
    batches) from a task queue, analyses them (keyword search, citation
    detection, NER tagging via the external SENNA tool) and pushes CSV-ready
    row lists onto a result queue.

    The actual work is selected by assigning one of the ``process*`` /
    ``export*`` bound methods to ``self._executeFunction`` before ``start()``
    is called; ``run()`` simply invokes it.  It is initialised to ``None``,
    so starting the thread without assigning it raises ``TypeError``.

    NOTE(review): this is Python 2 code (``unicode`` builtin, csv files
    opened ``'wb'``, list-returning ``filter``) — confirm the target
    interpreter before modernising.
    """

    def __init__(self, taskQueue, resultQueue, *args):
        # taskQueue  : queue of work items, terminated by the END_OF_QUEUE sentinel.
        # resultQueue: queue receiving finished CSV row lists.
        # *args      : extra parameters for the selected work function
        #              (processKeywordSearch reads the search string from args[0]).
        super(DataProcessorThread, self).__init__()
        self._taskQueue = taskQueue
        self._resultQueue = resultQueue
        self._args = args
        self._executeFunction = None  # must be assigned externally before start()
        self._db = DBController()
        # citation word list (presumably citation verbs such as "said") —
        # verify against the WORD_CITE resource
        self._citeWordList = getWordList(WORD_CITE)
        # the export* methods write into export/ — make sure it exists up front
        if not os.path.exists('export/'):
            os.makedirs('export/')

    def exportSentenceAnalysis(self):
        """Dump every sentence in the DB, with its per-sentence analysis
        fields, to export/sentence.csv (one row per sentence).

        Deprecated per the original author: should be refactored to use the
        task/result queues like the process* methods.
        """
        #sentence collection is all the sentence
        #deprecated, need to refactor and apply queue
        with open('export/sentence.csv', 'wb') as f:
            writer = csv.writer(f)
            sentences = self._db.getAllSentence()
            # cache articles by id so each article is fetched from the DB only once
            articleDict = {}
            attributeList = ['id', 'cotic', 'coname', 'filePath', 'accessionNo', 'content', 'coname','ceoname', 'cite', 'co_c', 'ceo_c', 'analyst_c', 'pfm', 'pfm_words', 'pos', 'pos_words', 'neg', 'neg_words', 'internal', 'int_words', 'external', 'ext_words', 'quote_sen', 'analyst']
            writer.writerow(attributeList)
            for i, sentence in enumerate(sentences):
                try:
                    print(i)  # progress indicator
                    if sentence['articleId'] not in articleDict:
                        articleDict[sentence['articleId']] = self._db.getArticleById(sentence['articleId'])
                    article = articleDict[sentence['articleId']]
                    # the company code is encoded in the article's file path; an 'a'
                    # in the second-to-last segment shifts the code one level up
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    # fall back to the raw code when the company is not in the DB
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    sentenceCompanyList = [self._db.getCompanyById(companyId) for companyId in sentence['company']]
                    sentenceCompanyNameString = ','.join([company['shortName'] for company in sentenceCompanyList])
                    sentenceEngagerList = [self._db.getEngagerById(engagerId) for engagerId in sentence['engager']]
                    CEOList = filter(lambda engager : engager['type'] == ENGAGER_CEO, sentenceEngagerList)
                    # NOTE(review): analystList is computed but never used below
                    analystList = filter(lambda engager : engager['type'] == ENGAGER_ANALYST, sentenceEngagerList)
                    CEONameString = ','.join([CEO['lastName'] for CEO in CEOList])
                    citeWordString = ','.join(sentence['cite'])
                    citeCompany, citeCEO, citeAnalyst = int(sentence['citeCompany']), int(sentence['citeCEO']), int(sentence['citeAnalyst'])
                    pfmWordString = ','.join(sentence['pfm'])
                    posWordString = ','.join(sentence['pos'])
                    negWordString = ','.join(sentence['neg'])
                    inWordString = ','.join(sentence['in'])
                    exWordString = ','.join(sentence['ex'])
                    quoteString = getQuotedString(sentence['content'])
                    analystSurroundString = getStringSurroundWordInDistance(sentence['content'], 'analyst', ANALYST_SURROUND_DISTANCE)
                    # column order must match attributeList above
                    lineList = [sentence['_id'], articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], sentence['content'].encode('utf-8'), sentenceCompanyNameString, CEONameString, citeWordString, citeCompany, citeCEO, citeAnalyst, len(sentence['pfm']), pfmWordString, len(sentence['pos']), posWordString, len(sentence['neg']), negWordString, len(sentence['in']), inWordString, len(sentence['ex']), exWordString, quoteString, analystSurroundString]
                    writer.writerow(lineList)
                except Exception as e:
                    # best-effort export: report the failing row and keep going
                    print(e)

    def exportArticleAnalysis(self):
        """Dump every article's metadata (plus up to ARTICLE_EXPORT_CODE_SIZE
        company codes and subject codes) to export/article.csv.

        Also backfills missing 'company' / 'newsSubject' fields on the
        article document and saves it back to the DB.  Deprecated per the
        original author.
        """
        #deprecated
        with open('export/article.csv', 'wb') as f:
            writer = csv.writer(f)
            articleList = list(self._db.getAllArticle())
            attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'coname1', 'coname2', 'coname3', 'coname4', 'coname5', 'subjectCode1', 'subjectCode2', 'subjectCode3', 'subjectCode4', 'subjectCode5']
            writer.writerow(attributeList)
            for i, article in enumerate(articleList):
                try:
                    print(i)  # progress indicator
                    # derive the company code from the file path (see
                    # exportSentenceAnalysis for the same convention)
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    # fixed-width code slots so every row has the same shape
                    companyCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    subjectCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    if 'company' in article:
                        # copy at most ARTICLE_EXPORT_CODE_SIZE codes into the slots
                        for i, companyCode in enumerate(article['company']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            companyCodeList[i] = companyCode
                    else:
                        # backfill: derive the company list from the file path
                        article['company'] = [articleCompanyCode]
                        # NOTE(review): this rebinds companyCodeList to a 1-element
                        # list, so the row width differs from the fixed-width header
                        # in this branch — confirm whether that is intended
                        companyCodeList = article['company']
                    if 'newsSubject' in article:
                        for i, subjectCode in enumerate(article['newsSubject']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            subjectCodeList[i] = subjectCode
                    else:
                        # backfill an empty subject list
                        article['newsSubject'] = []
                        # NOTE(review): same row-width caveat as above (empty list)
                        subjectCodeList = article['newsSubject']
                    # persist any backfilled fields
                    self._db.saveArticle(article)
                    lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'], article['byline']] + companyCodeList + subjectCodeList
                    writer.writerow(lineList)
                except Exception as e:
                    # best-effort export: report the failing row and keep going
                    print(e)

    def processKeywordSearch(self):
        """Queue worker: for each article pulled from the task queue, find
        sentences (falling back to whole paragraphs) matching the keyword
        search string given in self._args[0], and put one CSV row per
        article onto the result queue.  Stops at the END_OF_QUEUE sentinel.

        NOTE(review): unlike processCitationBlock this worker never calls
        task_done() — verify the task queue is not join()ed for this mode.
        """
        searchString = self._args[0]
        while True:
            article = self._taskQueue.get()
            if article == END_OF_QUEUE:
                break
            else:
                articlePathPartList = article['filePath'].split('/')
                articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                articleSentenceList = []
                #here, use '|' to combine regex is OK, because sentence is short, will not reduce the performance that much.
                #But in DB search, use iterative way.
                pattern = getPatternByKeywordSearchString(searchString)
                #on sentence level first, if can't find, go to paragraph level.
                for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                    sentenceList = sent_tokenize(paragraph)
                    for sentence in sentenceList:
                        if re.search(pattern, sentence) is not None:
                            articleSentenceList.append(sentence.encode('utf-8').strip())
                if not articleSentenceList:
                    #search on paragraph level
                    for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                        if re.search(pattern, paragraph) is not None:
                            articleSentenceList.append(paragraph.encode('utf-8').strip())
                # matched snippets are tab-joined into a single CSV cell
                lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['headline'].strip(), '\t'.join(articleSentenceList)]
                self._resultQueue.put(lineList)

    def processCitationBlock(self):
        """Queue worker: pull article BATCHES from the task queue, detect
        quoted/cited sentences, enrich them with SENNA NER results
        (persons, organisations, CEO match, brokerage mention) and dictionary
        word matches, and push one row list per sentence to the result queue.

        Row layout (indexes referenced by the constants below):
          0-8  : article part (code, name, path, id, date, source, byline,
                 byline_cleaned, headline)
          9    : sentence text            (sentenceTextIndex)
          10-11: quoted parts, cite words
          12-16: NER part                 (NERStartIndex, NERPartCount = 5)
          17   : sentence word count
          18+  : word-match counts then word-match strings
                 (wordMatchStartIndex, one pair of slots per pattern)
        """
        #because list is too long, we need to separate name in to chunk
        # (a single giant alternation regex would be unwieldy, so brokerage
        #  names are compiled in chunks of 500)
        brokerNameList = list(self._db.getAllBrokerageEffectiveNameList())
        brokerageNamePatternList = []
        for i in range(0, len(brokerNameList), 500):
            brokerageNamePatternList.append(re.compile(r'|'.join([r'\b' + name + r'\b' for name in brokerNameList[i : i + 500]]), re.IGNORECASE))
        quotePattern = re.compile(r'\"[^\"]+\"')
        citeWordPatternStringList = [(r'\b' + citeWord + r'\b') for citeWord in self._citeWordList]
        companyCEODict = self._db.getAllCompanyCEODict()
        engagerNamePattern = re.compile(r'|'.join(['CEO', 'analyst', 'executive']), re.IGNORECASE)
        citeWordPattern = re.compile(r'|'.join(citeWordPatternStringList), re.IGNORECASE)
        wordMatchPatternList = [getWordRegexPattern(WORD_CAUSE_IN), getWordRegexPattern(WORD_CAUSE_EX), getWordRegexPattern(WORD_CONTROL_LOW), getWordRegexPattern(WORD_CONTROL_HIGH), getWordRegexPattern(MCD_POS), getWordRegexPattern(MCD_NEG), getWordRegexPattern(MCD_UNCERTAIN)]
        filterWordDict = getWordDict(WORD_FILTER)
        while True:
            #process in batch
            articleBatch = self._taskQueue.get()
            if articleBatch == END_OF_QUEUE:
                self._taskQueue.task_done()
                break
            else:
                lineListBatch = []
                toProcessSentenceBatch = []
                # column indexes into each row — see the docstring layout above
                sentenceTextIndex, NERStartIndex, NERPartCount, wordMatchStartIndex = 9, 12, 5, 18
                #add byline_cleaned in articleDict
                self.processBylineInBatch(articleBatch)
                for article in articleBatch:
                    self._db.setArticleProcessed(article['_id'])
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    articleLineListPart = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['byline_cleaned'], article['headline'].strip()]
                    for paragraph in [article['leadParagraph'], article['tailParagraph']]:
                        #if found quoted part in this paragraph
                        quotedStringList = re.findall(quotePattern, paragraph)
                        if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5:
                            #Among all the quoted parts, the max word count MUST be bigger than 5.
                            #If so, then get all sentences
                            sentenceList = sent_tokenize(paragraph)
                            for sentence in sentenceList:
                                quotedStringList = re.findall(quotePattern, sentence)
                                citeWordList = re.findall(citeWordPattern, sentence)
                                #If this sentence has quotation and quoted part word count is bigger than 5 and has cite word,
                                #then parse it and add to the export
                                if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5 and citeWordList:
                                    # NER and word-match slots are pre-filled with ''
                                    # and filled in below / after the NER batch pass
                                    lineList = articleLineListPart + [sentence, '. '.join(quotedStringList), ', '.join(citeWordList)] + [''] * NERPartCount + [len(sentence.split())] + [''] * len(wordMatchPatternList) * 2
                                    # Match the keywords in each dictionary pattern
                                    for i, pattern in enumerate(wordMatchPatternList):
                                        matchedWordList = getMatchWordListFromPattern(sentence, pattern, filterWordDict)
                                        lineList[i + wordMatchStartIndex] = len(matchedWordList)
                                        lineList[i + len(wordMatchPatternList) + wordMatchStartIndex] = ', '.join(matchedWordList)
                                    lineListBatch.append(lineList)
                                    toProcessSentenceBatch.append(sentence)
                # one SENNA invocation for the whole batch of cited sentences
                actorAndOrgListBatch = self.processCiteSentenceInBatch(toProcessSentenceBatch)
                for i, actorAndOrgList in enumerate(actorAndOrgListBatch):
                    # None means no person/org found outside quotes; such rows
                    # are skipped (never put on the result queue)
                    if actorAndOrgList is not None:
                        engagerNameList = re.findall(engagerNamePattern, lineListBatch[i][sentenceTextIndex])
                        # FCEO = 1 when any name part of a detected person matches
                        # the focal company's CEO name
                        FCEO = 0
                        articleCompanyCode = lineListBatch[i][0]
                        for name in actorAndOrgList[0].split(', '):
                            for namePart in name.split():
                                if articleCompanyCode in companyCEODict and companyCEODict[articleCompanyCode].find(namePart) != -1:
                                    FCEO = 1
                        lineListBatch[i][NERStartIndex] = actorAndOrgList[0]
                        lineListBatch[i][NERStartIndex + 1] = actorAndOrgList[1]
                        lineListBatch[i][NERStartIndex + 2] = ' '.join(engagerNameList)
                        lineListBatch[i][NERStartIndex + 3] = FCEO
                        # brokerage names are searched only outside quoted spans
                        unQuotedPart = re.sub(r'"[^"]+"', '', lineListBatch[i][sentenceTextIndex])
                        findBrokerage = False
                        for pattern in brokerageNamePatternList:
                            result = pattern.search(unQuotedPart)
                            # patterns are IGNORECASE, so additionally require the
                            # matched text to start with an uppercase letter
                            if result is not None and result.string[result.regs[0][0]].isupper():
                                findBrokerage = True
                                break
                        lineListBatch[i][NERStartIndex + 4] = 1 if findBrokerage else 0
                        self._resultQueue.put(lineListBatch[i])
                self._taskQueue.task_done()

    def getNERTaggedTupleListFromSentence(self, sentence):
        """Run the external SENNA tagger on *sentence* and return a list of
        [token, tag] pairs, one per token.

        The tag column is reduced to its suffix after '-' (e.g. 'B-PER' ->
        'PER' — presumably BIO-style tags; confirm against SENNA's output
        format).  NOTE(review): relies on a temp/ directory and the
        ./senna/ binary existing relative to the working directory, and
        shells out via os.system with no error checking.
        """
        #use senna name entity tagger, it fast!!
        sentence = unicode(sentence).encode('utf-8', 'ignore')
        with open('temp/input.txt', 'w') as f:
            f.write(sentence)
        os.system('./senna/senna -path senna/ -ner <temp/input.txt> temp/output.txt')
        with open('temp/output.txt', 'r') as f:
            # one output line per token; column 1 is the NER tag
            tagTupleList = [[word.strip().split('-')[-1] if i ==1 else word.strip() for i, word in enumerate(line.split())] for line in f.readlines() if line.split()]
        return tagTupleList

    def processBylineInBatch(self, articleBatch):
        """NER-tag all bylines of *articleBatch* in one SENNA call and store
        the comma-joined person names on each article as 'byline_cleaned'.

        Bylines are joined with the ' ****** ' separator so the per-article
        boundaries can be recovered from the tagger output; empty bylines
        are replaced with 'null.' so they still produce a segment.
        """
        #use 'null.' to replace '' bylines, because if the last byline is '', it would not be added to the concatenated string.
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join([article['byline'] if article['byline'] else 'null.' for article in articleBatch]))
        personList, lastTag, wordList = [], '', []
        articleIndex = 0
        for i in range(len(tagTupleList)):
            # group consecutive tokens sharing a tag; flush a person name
            # whenever a PER run ends
            if tagTupleList[i][1] != lastTag:
                if lastTag == 'PER':
                    personList.append(' '.join(wordList))
                wordList = [tagTupleList[i][0]]
                lastTag = tagTupleList[i][1]
            else:
                wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one article's byline segment
                articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList) if personList else ''
                personList, lastTag, wordList = [], '', []
                # do not advance past the last article on the final token
                articleIndex = articleIndex + 1 if i != len(tagTupleList) - 1 else articleIndex
        if articleIndex >= len(articleBatch):
            return
        # any articles whose segment was lost in the tagger output still get
        # an (empty) byline_cleaned so later code can rely on the key
        while articleIndex < len(articleBatch):
            articleBatch[articleIndex]['byline_cleaned'] = ''
            articleIndex += 1

    def processCiteSentenceInBatch(self, sentenceBatch):
        """NER-tag all sentences of *sentenceBatch* in one SENNA call and
        return, per sentence, either None (nothing found) or a 2-element
        list [person names, organisation names] — comma-joined strings of
        entities found OUTSIDE quoted spans.

        Sentences are joined with ' ****** ' so boundaries survive the
        round trip through the tagger (same trick as processBylineInBatch).
        """
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join(sentenceBatch))
        personAndOrgListBatch = []
        personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        for i in range(len(tagTupleList)):
            if tagTupleList[i][0] == '\"':
                # toggle quote state; discard any entity run that started
                # inside the quote
                inQuoteFlag = 1 - inQuoteFlag
                if not inQuoteFlag:
                    del wordList[:]
            else:
                if not inQuoteFlag:
                    # group consecutive same-tag tokens; flush PER/ORG runs
                    if tagTupleList[i][1] != lastTag:
                        if lastTag == 'PER':
                            personList.append(' '.join(wordList))
                        elif lastTag == 'ORG':
                            orgnizationList.append(' '.join(wordList))
                        wordList = [tagTupleList[i][0]]
                        lastTag = tagTupleList[i][1]
                    else:
                        wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one sentence segment
                if not personList and not orgnizationList:
                    personAndOrgListBatch.append(None)
                else:
                    personAndOrgListBatch.append([', '.join(personList), ', '.join(orgnizationList)])
                personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        return personAndOrgListBatch

    def run(self):
        # Thread entry point: delegate to whichever work function the caller
        # assigned to self._executeFunction (raises TypeError if still None).
        self._executeFunction()
class SignifierParser(object):
    """Extracts sentences from stored articles and annotates them with
    'signifier' word lists — performance (pfm), positive/negative tone,
    internal/external attribution (atrb) and citation verbs — persisting
    every result through DBController.

    NOTE(review): Python 2 code — the ``filter(...)`` results stored into
    sentence documents are lists under Python 2 but would be lazy filter
    objects under Python 3; confirm the interpreter before porting.
    """

    def __init__(self):
        # Word lists for each signifier category, loaded once.
        self.db = DBController()
        self.pfmWord = getWordList(WORD_PFM)
        self.posWord = getWordList(WORD_POS)
        self.negWord = getWordList(WORD_NEG)
        self.exWord = getWordList(ATRB_EX)
        self.inWord = getWordList(ATRB_IN)
        self.citeWord = getWordList(WORD_CITE)
        # All engagers/companies are materialised up front so the compiled
        # regex dicts below cover every entity.
        self.engagerList = list(self.db.getAllEngager())
        self.companyList = list(self.db.getAllCompany())
        self.engagerRegexPatternDict, self.companyRegexPatternDict = self.getRegexPatternDictForEngagerAndCompany()

    def getRegexPatternDictForEngagerAndCompany(self):
        """Compile one whole-word regex per engager and per company, keyed by
        '_id', and return the two dicts (engagerDict, companyDict).

        Engagers are matched by last name, except for a handful of surnames
        (Jones, Johnson, West, Post, Ford) where the full name is used —
        presumably because those surnames are too common/ambiguous on their
        own.  Company patterns are case-insensitive; engager patterns are
        case-sensitive.
        """
        engagerRegexPatternDict, companyRegexPatternDict = {}, {}
        for engager in self.engagerList:
            if engager['lastName'] == 'Jones' or engager['lastName'] == 'Johnson' or engager['lastName'] == 'West' or engager['lastName'] == 'Post' or engager['lastName'] == 'Ford':
                searchName = engager['name']
            else:
                searchName = engager['lastName']
            engagerRegexPatternDict[engager['_id']] = re.compile(r'\b' + searchName + r'\b')
        for company in self.companyList:
            companyRegexPatternDict[company['_id']] = re.compile(r'\b' + company['shortName'] + r'\b', re.IGNORECASE)
        return engagerRegexPatternDict, companyRegexPatternDict

    def extractAllSentenceToDB(self, isReload=False):
        """Tokenise every article's lead/tail paragraphs into sentences and
        insert each valid sentence that mentions an engager or company into
        the sentence collection.

        isReload: when True, drop the existing sentence collection first.
        """
        if isReload:
            self.db.dropSentence()
        # for company in self.companies:
        for i, company in enumerate(self.companyList):
            articles = list(self.db.getAllArticleByCompanyCode(company['code']))
            # only this company's engagers are matched against its articles
            engagers = list(self.db.getAllEngagerByCompanyId(company['_id']))
            for j, article in enumerate(articles):
                print(i, j)  # progress indicator (company, article)
                paragraphSet = ('leadParagraph', 'tailParagraph')
                for key in paragraphSet:
                    paragraph = article[key]
                    sentenceList = sent_tokenize(paragraph)
                    for string in sentenceList:
                        if not isValidSentence(string):
                            continue
                        sentenceDict = {'content' : string.encode('utf-8'), 'articleId' : article['_id'], 'paragraph' : key}
                        # parseRawSentence returns None when no entity matches;
                        # such sentences are not stored
                        sentenceDict = self.parseRawSentence(sentenceDict, engagers)
                        if sentenceDict is not None:
                            self.db.insertSentence(sentenceDict)

    def parseRawSentence(self, sentenceDict, engagerList):
        """Attach de-duplicated 'engager' and 'company' id lists to
        *sentenceDict* based on regex matches against its content.

        Returns the enriched dict, or None when the sentence mentions no
        engager from *engagerList* and no company at all (i.e. not worth
        storing).
        """
        engagerIdList, companyIdList = [], []
        for engager in engagerList:
            if self.engagerRegexPatternDict[engager['_id']].search(sentenceDict['content']) is not None:
                engagerIdList.append(engager['_id'])
        # companies are matched against the full company list, not just the
        # focal company's
        for company in self.companyList:
            if self.companyRegexPatternDict[company['_id']].search(sentenceDict['content']) is not None:
                companyIdList.append(company['_id'])
        if not engagerIdList and not companyIdList:
            return None
        else:
            sentenceDict['engager'] = list(set(engagerIdList))
            sentenceDict['company'] = list(set(companyIdList))
            return sentenceDict

    def parseAllSentenceCitation(self):
        """For every stored sentence: extract its citation verbs ('cite')
        and the three proximity flags (citeCEO, citeAnalyst, citeCompany),
        then save the sentence back to the DB.
        """
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            words = getProcessedWordList(sentence['content'], VERB)
            sentence['cite'] = filter(lambda word : word in self.citeWord, words)
            sentence['citeCEO'], sentence['citeAnalyst'], sentence['citeCompany'] = self.isCiteInDistance(sentence)
            self.db.saveSentence(sentence)

    def parseAllSentencePfm(self):
        """For every stored sentence: find performance nouns and the
        positive/negative verbs occurring within PFM_DISTANCE of them, then
        persist the three word lists via updateSentencePfm.
        """
        #list them all, because if we loop with a cursor and update the cursor-pointed sentence at the same time, the cursor gets screwed up.
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            # pfm words are matched on the noun-processed token list,
            # pos/neg words on the verb-processed one
            pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
            pfmWordList = filter(lambda word : word in self.pfmWord, pfmSentenceWordList)
            posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
            posWordList = filter(lambda word : word in self.posWord, posNegSentenceWordList)
            negWordList = filter(lambda word : word in self.negWord, posNegSentenceWordList)
            # keep only pos/neg words close enough to some pfm word
            posWordList, negWordList = self.filterPosNegWordListByDistance(pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList)
            self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)

    def parseAllSentenceAtrb(self):
        """For every stored sentence: extract external/internal attribution
        nouns and persist them via updateSentenceAtrb.

        Internal-attribution words are discarded when the sentence is a
        citation and the internal words are only 'ceo'/'executive' mentions
        (i.e. the speaker reference, not an attribution).
        """
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)  # progress indicator
            words = getProcessedWordList(sentence['content'], NOUN)
            exWordList = filter(lambda word : word in self.exWord, words)
            inWordList = filter(lambda word : word in self.inWord, words)
            if ('ceo' in inWordList or 'executive' in inWordList) and sentence['cite']:
                inWordList = []
            self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)

    def isCiteInDistance(self, sentence):
        """Return (isCiteCEO, isCiteAnalyst, isCiteCompany): whether a
        citation verb occurs within CITE_DISTANCE words of an engager name /
        a company short name in the sentence's processed word list.

        The bare except/pass absorbs the ValueError raised by .index() when
        a name does not occur in the word list — that is the expected
        "not cited" path.  NOTE(review): it also silently hides DB-lookup
        errors; only the FIRST occurrence of each word is considered.
        """
        #if (CEO or Company) and citation word happen within CITE_DISTANCE words, capture it
        isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
        if sentence['cite']:
            wordList = getProcessedWordList(sentence['content'], VERB)
            for citeWord in sentence['cite']:
                citeIndex = wordList.index(citeWord)
                for engagerId in sentence['engager']:
                    try:
                        engager = self.db.getEngagerById(engagerId)
                        matchName = engager['lastName'].lower()
                        engagerIndex = wordList.index(matchName)
                        if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
                            # non-CEO engagers are treated as analysts
                            if engager['type'] == ENGAGER_CEO:
                                isCiteCEO = True
                            else:
                                isCiteAnalyst = True
                    except:
                        pass
                for companyId in sentence['company']:
                    try:
                        company = self.db.getCompanyById(companyId)
                        matchName = company['shortName'].lower()
                        companyIndex = wordList.index(matchName)
                        if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
                            isCiteCompany = True
                    except:
                        pass
        return isCiteCEO, isCiteAnalyst, isCiteCompany

    def filterPosNegWordListByDistance(self, pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList):
        """Return (filteredPos, filteredNeg): the pos/neg words whose first
        occurrence lies within PFM_DISTANCE tokens of some pfm word's first
        occurrence.

        Positions are taken from the respective processed word lists
        (pfm words indexed in *pfmSentenceWordList*, pos/neg words in
        *posNegSentenceWordList*).  NOTE(review): a word near several pfm
        words is appended once per nearby pfm word (duplicates possible).
        """
        filteredPosWordList, filteredNegWordList = [],[]
        for pfmWord in pfmWordList:
            pfmIndex = pfmSentenceWordList.index(pfmWord)
            for posWord in posWordList:
                posIndex = posNegSentenceWordList.index(posWord)
                if abs(pfmIndex - posIndex) <= PFM_DISTANCE:
                    filteredPosWordList.append(posWord)
            for negWord in negWordList:
                negIndex = posNegSentenceWordList.index(negWord)
                if abs(pfmIndex - negIndex) <= PFM_DISTANCE:
                    filteredNegWordList.append(negWord)
        return filteredPosWordList, filteredNegWordList