def search_date(self,
                    sen_id,
                    sen,
                    sen_lemma,
                    sen_ner,
                    keyword=None,
                    dirc=None):
        """Collect the date expressions mentioned in a sentence.

        Candidates are whatever ``search_tag`` reports for the ``DATE`` NER
        tag, plus the literal ``'morning'`` (or else ``'afternoon'``) when
        that word occurs in the space-joined lemmas.

        When ``keyword`` is given, candidates are filtered by their character
        position in the space-joined lemmas relative to the keyword:

        * ``dirc > 0``  keeps dates located at or after the keyword;
        * otherwise     keeps dates located at or before the keyword.

        Args:
            sen_id: Sentence identifier (not used by this method).
            sen: Tokens of the sentence.
            sen_lemma: Lemmas of the sentence tokens.
            sen_ner: NER tags of the sentence tokens.
            keyword: Optional anchor word used to filter the candidates.
            dirc: Search direction for ``keyword``; required when ``keyword``
                is given, otherwise the problem is logged and the candidates
                are returned unfiltered.

        Returns:
            list: The (possibly filtered) date candidates.

        Raises:
            ValueError: If ``keyword`` or a candidate cannot be located in the
                space-joined lemmas (same lookup the original code performed).
        """
        date_candidates = list(search_tag(sen, 'DATE', sen_ner))
        lemma_text = ' '.join(sen_lemma)
        if 'morning' in lemma_text:
            date_candidates.append('morning')
        elif 'afternoon' in lemma_text:
            date_candidates.append('afternoon')

        if keyword is None:
            return date_candidates
        if dirc is None:
            self.logger.debug(
                'Error:\t |ipo_detect\tsearch_date|\tNeed search dirction for search keyword {}\n'
                .format(keyword))
            return date_candidates
        if not date_candidates:
            # Nothing to filter; the keyword is not looked up in this case,
            # matching the original behaviour for an empty candidate list.
            return date_candidates

        keyword_pos = lemma_text.index(keyword)
        # Build a new list instead of removing from the list being iterated:
        # in-place removal during iteration skips the element that follows
        # each removed one, so some out-of-direction dates were kept.
        if dirc > 0:
            return [
                date for date in date_candidates
                if lemma_text.index(date) >= keyword_pos
            ]
        return [
            date for date in date_candidates
            if lemma_text.index(date) <= keyword_pos
        ]
    def search_trade_info(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
        """Score a sentence for evidence that the topic company started trading.

        The lemmas are joined with the module-level separator ``sp`` and
        matched against two patterns.

        Args:
            sen_id: Sentence identifier, only used in the printed message.
            sen: Tokens of the sentence.
            sen_lemma: Lemmas of the sentence tokens.
            sen_ner: NER tags of the sentence tokens.
            sen_entity: Per-token entity annotations of the sentence.

        Returns:
            tuple: ``(score, tag)`` where
                * ``(1, 'Upcoming')`` - the sentence only states an intention
                  (will/could/would/to ... trade);
                * ``(1, 'Trade')`` - an "open/begin/start trade ... on/at"
                  sentence is attributed to ``self.topic_company``;
                * ``(0, None)`` - otherwise.
        """
        lemma_text = sp.join(sen_lemma)
        if 'trad' not in lemma_text:
            return 0, None

        # Eliminate sentences that merely express an intention to trade.
        intention_pattern = re.compile(
            "^.*(will|could|would|to)~\^~(\w{,7}~\^~)*trade")
        if intention_pattern.match(lemma_text) is not None:
            return 1, 'Upcoming'

        begin_pattern = re.compile(
            '.*(open|begin|start)~\^~(.*~\^~){,1}(trade|trading)~\^~(.*~\^~){,1}(on|at).*'
        )
        if begin_pattern.match(lemma_text) is None:
            return 0, None

        hit_template = 'Find IPO keyword | {} | in sentence{} for target company {}:\tget 1 score for IPO'
        organizations = [
            name for name in search_tag(sen, 'ORGANIZATION', sen_ner)
            if name not in financial_terms
        ]
        if organizations:
            # When organizations are present, only they decide the outcome;
            # the entity annotations are not consulted.
            if any(
                    match_substring(name, self.topic_company)
                    for name in organizations):
                print(hit_template.format('Begin Trade', sen_id,
                                          self.topic_company))
                return 1, 'Trade'
            return 0, None

        # No organization in the sentence: fall back to the entity annotations.
        if self.topic_company in ' '.join(sen_entity):
            print('Entity linked:')
            print(hit_template.format('Begin Trade', sen_id,
                                      self.topic_company))
            return 1, 'Trade'
        return 0, None
 def match_comp_ticker(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
     """Yield the tickers found in a sentence with the running IPO score.

     For every code in ``IPO_Search.stock_codes`` that occurs in the
     space-joined sentence, ``self.search_ticker`` is asked for a
     ``sp``-separated ticker record. A record that starts with ``sp`` (no
     company part) is prefixed with the first linked entity, else the topic
     company when it appears in the sentence, else the literal ``'None'``.

     Args:
         sen_id: Sentence identifier, used for lookups and messages.
         sen: Tokens of the sentence.
         sen_lemma: Lemmas of the sentence tokens.
         sen_ner: NER tags of the sentence tokens.
         sen_entity: Per-token entity annotations of the sentence.

     Yields:
         tuple: ``(ticker_record, score)``. ``score`` is cumulative over the
         codes processed so far and grows by one each time the record's
         company matches the topic company and the sentence intention is
         ``'IPO'``; when the company matches but the intention differs, the
         record gets a trailing ``'*'`` instead.
     """
     score = 0
     for code in IPO_Search.stock_codes:
         sentence_text = ' '.join(sen)
         if code not in sentence_text:
             continue

         ticker = self.search_ticker(sen_id, sen, sen_ner, code)
         if ticker == '':
             continue

         if ticker.startswith(sp):
             # The record lacks a company name: pick a prefix for it.
             linked = list(search_tag(sen, 'O', sen_entity))
             if linked:
                 owner = linked[0]
             elif self.topic_company in sentence_text:
                 owner = self.topic_company
             else:
                 self.logger.debug(
                     'Error:\t|ipo_detect\tmatch_comp_ticker|\tNo match company for {}. \t|{}\n'
                     .format(ticker, sen_id))
                 owner = 'None'
             ticker = owner + ticker

         company_name = ticker.split(sp)[0]
         if match_substring(company_name, self.topic_company):
             intention, _confidence = self.search_intention(
                 sen_id, sen, sen_lemma)
             if intention != 'IPO':
                 ticker += '*'
             else:
                 print('Find ticker | {} | in sentence{} for target company {}:\tget 1 score for IPO.'.format(
                     ticker, sen_id, self.topic_company))
                 score += 1
         yield (ticker, score)
    def entity_coref_rsl(self):
        """Link pronouns / definite articles to organization entities.

        For every sentence ``j`` a tag array ``self.entity_coref[j]`` is
        created (initially all ``'O'``) and, for each occurrence of a
        pronoun or definite article ("it", "its", "the company", ...),
        ``update_coref_entity`` is called with a referent string. Referents
        are formatted ``'<sentence>@<start>@<end>'`` or, when falling back to
        the topic company, ``self.topic_company + '*'``.

        Resolution order for each occurrence:
            1. an ORGANIZATION that follows a ``:`` or ``--`` right after
               the pronoun (the result is currently not final, see the
               review note on ``flag`` below);
            2. the last organization / already-linked entity inside the NP
               range returned by ``self.search_dominated_np_for_pd``;
            3. ``last_entity``: the subject organization found in an earlier
               sentence, else the first organization of the previous
               sentence, else the topic company.

        Returns:
            ``self.entity_coref`` after all sentences have been processed.
        """
        # predefined pronouns and definite articles
        # (the list is shared by all sentences, so phrases appended below
        # remain active for the following sentences as well)
        pds = ['it', 'its', 'the company', 'the business', 'the firm']
        # last_entity/last_entity_index are used to record the org entity in the subject part of prior sentence
        last_entity, last_entity_index = None, None
        # Iterate each sentence
        for j in xrange(0, len(self.words)):
            self.entity_coref[j] = ['O'] * len(self.words[j])
            # Load the NLP annotations of the current sentence
            sen_id, sen, sen_lemma = self.sen_ids[j], self.words[
                j], self.lemmas[j]
            sen_ner, sen_pos, sen_tree_str = self.ners[j], self.pos_tags[
                j], self.parse_trees[j]
            # presumably maps a token index to a list of (index, tag) pairs;
            # built by sen_depen, which is defined elsewhere - TODO confirm
            sen_depend = sen_depen(self.dependencies[j],
                                   len(sen),
                                   reverse=True)
            # self.logger.info('Sentence id: {} \n'.format(sen_id))
            # find all candidates companies other than special entities like journal, stock exchange institutions
            comps = [(comp_index, comp) for (
                comp_index,
                comp) in search_tag(sen, 'ORGANIZATION', sen_ner, index=True)
                     if comp not in CR.special_terms]
            # using dependency feature to find any definite articles
            # like "the XXX company" and add them to the pds list
            for index in search_word('the', sen):
                (dep_index, tag) = sen_depend[index][0]
                if sen[dep_index] in ['company', 'business', 'firm']:
                    new_pd = ' '.join(sen[index:dep_index + 1])
                    if new_pd not in pds:
                        pds.append(new_pd)
            for pd in pds:
                pd_indices = [index for index in search_word(pd, sen)]
                for pd_index in pd_indices:
                    if pd == 'it':
                        # Skip sentence such as 'it's + adj, it's + noun.'
                        (dep_index, tag) = sen_depend[pd_index][0]
                        # NOTE(review): sen[dep_index + 1] looks like it can go
                        # out of range when dep_index is the last token - confirm.
                        if sen[dep_index +
                               1] in ["to", "for", "that", "about", "because"]:
                            if sen_lemma[dep_index] == 'be' or \
                                ('be' in sen_lemma[pd_index:dep_index] and
                                     (sen_pos[dep_index] == 'JJ' or sen_pos[dep_index].startswith('NN'))
                                 ):
                                # self.logger.info('"it" is not recognized as pronoun, because of it + adj or noun.\n')
                                continue
                        # NOTE(review): this SBAR check has no effect at the
                        # moment - its only action is `pass` and the values
                        # returned by tree_position are not used afterwards.
                        if ',' in sen[pd_index:]:
                            sen_tree, sen_tree_index, sen_structure = tree_position(
                                sen_tree_str)
                            if 'SBAR' in sen_structure[pd_index]:
                                # self.logger.debug("Should find entity after pronoun/definite article {}.Sen_id:\t{}\n".
                                #                    format(pd, sen_id))
                                pass
                    # Resolve the situation when referent is supposed to locate after pronoun
                    # that is when there is ':' or '--' followed by organization entity right after the pronoun
                    flag = False
                    dash_count = 0
                    if '--' in sen[pd_index:pd_index +
                                   3] or ':' in sen[pd_index:pd_index + 3]:
                        i = pd_index + 1
                        while i < len(sen) - 1:
                            if (sen[i] == ':' or (sen[i] == '--' and dash_count%2 == 0)) \
                                    and sen_ner[i+1] == 'ORGANIZATION':
                                start = i + 1
                                # NOTE(review): the scan below starts at i (the
                                # ':' / '--' token), not at `start`; if that
                                # token is not tagged ORGANIZATION the loop does
                                # not advance and the end index written into the
                                # referent is start - 1 - confirm intent.
                                while i < len(
                                        sen) and sen_ner[i] == 'ORGANIZATION':
                                    i += 1
                                referent = '{}@{}@{}'.format(j, start, i)
                                update_coref_entity(
                                    pd_index, pd_index + len(pd.split(' ')),
                                    self.entity_coref[j], referent)
                                flag = True
                                # NOTE(review): if the scan above ran to the end
                                # of the sentence, i == len(sen) here and sen[i]
                                # would be out of range - confirm.
                                if sen[i] == '--':
                                    dash_count += 1
                            i += 1
                    # if any situation above is triggered, skip this pronoun for prior organization detection
                    # NOTE(review): despite the comment above, nothing is
                    # skipped - the branch is `pass`, so the lookups below
                    # still run and may call update_coref_entity again for the
                    # same pronoun. Confirm whether `continue` was intended.
                    if flag:
                        # self.logger.info('Referent is assumed to locate after pronoun, because of ":" or "--".\n')
                        pass
                    # Code below is used to prior organization entity detection and linked with pronoun
                    (np_start, np_end,
                     verb_index) = self.search_dominated_np_for_pd(
                         sen_tree_str, pd_index)
                    if np_start is None:
                        pass
                        # self.logger.debug('Cannot Find directed dominated NP for pronoun {}, and assign subject entity \
                        # --{}-- of the prior sentence to this pronoun.\n'.format(pd, last_entity))
                    else:
                        # Find any org entity located in the range (np_start, np_end)
                        # in sentence or its co-reference array
                        sub_entities = [
                            (comp_index, comp) for (comp_index, comp) in comps
                            if comp_index <= np_end and np_start <= comp_index
                        ]
                        # NOTE(review): search_tag(..., index=True) is unpacked
                        # into three values here but into two values everywhere
                        # else in this method - verify against search_tag.
                        sub_entities += [
                            (comp_index, comp)
                            for (comp_index, comp_end,
                                 comp) in search_tag(self.entity_coref[j],
                                                     'O',
                                                     self.entity_coref[j],
                                                     index=True)
                            if comp_index <= np_end and np_start <= comp_index
                        ]
                        # If any, get the last one, which is assumed to be nearest one to the pronoun
                        if len(sub_entities) > 0:
                            (referent_index, refer_entity) = sub_entities[-1]
                            # if '@' in refer_entity means the refer_entity is from the co-reference array,
                            # just assign its value to the referent
                            if '@' in refer_entity:
                                referent = refer_entity
                            else:
                                # entity_index format is sen_id@start_index@end_index
                                referent = '{}@{}@{}'.format(
                                    j, referent_index, referent_index +
                                    len(refer_entity.split(' ')))
                            # update referent in the co-reference array
                            update_coref_entity(pd_index,
                                                pd_index + len(pd.split(' ')),
                                                self.entity_coref[j], referent)
                            continue
                        else:
                            pass
                            # self.logger.debug('Cannot find entity for pronoun/definite article {} in dominated NP part,\
                            #  and assign subject entity --{}-- of the prior sentence to this pronoun.\n'.format(pd, last_entity))
                    # Fallback: no entity was found in the dominating NP, so
                    # the pronoun is linked to last_entity; initialise it first
                    # when it is still None.
                    if last_entity is None:
                        # if the sentence is the first sentence, use topic company instead
                        if j == 0:
                            last_entity = self.topic_company
                            last_entity_index = self.topic_company + '*'
                            # self.logger.info('Last entity is None and update it with topic company (1st sentence)!\n')
                        else:
                            # otherwise take the first organization of the
                            # previous sentence, if it has one
                            pre_comps = [
                                (comp_index, comp)
                                for (comp_index,
                                     comp) in search_tag(self.words[j - 1],
                                                         'ORGANIZATION',
                                                         self.ners[j - 1],
                                                         index=True)
                                if comp not in CR.special_terms
                            ]
                            if len(pre_comps) > 0:
                                comp_index, last_entity = pre_comps[0]
                                last_entity_index = '{}@{}@{}'.format(
                                    j - 1, comp_index,
                                    comp_index + len(last_entity.split(' ')))
                                # self.logger.info('Last entity is None and update last entity with organization \
                                # appeared in prior sentence: {}\n'.format(last_entity))
                            else:
                                # self.logger.info('Last entity is None and update it with topic company!\n')
                                last_entity = self.topic_company
                                last_entity_index = self.topic_company + '*'
                    # update referent in the co-reference array
                    update_coref_entity(pd_index,
                                        pd_index + len(pd.split(' ')),
                                        self.entity_coref[j],
                                        last_entity_index)
            # Identify new organization entity in subject of the sentence and
            # update it to be the last entity
            (new_entity_index,
             new_entity) = self.search_subject_entity(sen_id, sen,
                                                      sen_tree_str, sen_ner,
                                                      comps)
            if new_entity:
                last_entity = new_entity
                # NOTE(review): the sentence part of this referent comes from
                # sen_id.split('@')[1], while the other referents in this
                # method use the loop index j - presumably the two agree;
                # verify against how sen_ids are built.
                last_entity_index = '{}@{}@{}'.format(
                    sen_id.split('@')[1], new_entity_index,
                    new_entity_index + len(new_entity.split(' ')))
                # self.logger.info('UPDATE LAST ENTITY to be {}\n'.format(sen_id, new_entity))

        return self.entity_coref
 def search_raise_fund(self,
                       sen_id,
                       sen,
                       sen_lemma,
                       sen_ner,
                       sen_entity,
                       sen_ipo_tags=None):
     """Yield fund-raising facts (money amounts or share counts) per company.

     Two mutually exclusive modes, chosen from the NER tags:

     * ``MONEY`` present - every money span ending in ``'illion'`` is paired
       with a company and a confidence that starts at 0.2 and is raised by
       0.15 (no ``sen_ipo_tags`` given) or 0.3 (``sen_id`` is in
       ``sen_ipo_tags``), then by 0.2 (a ``file`` lemma after the amount) or
       0.1 (a ``raise`` lemma).
     * otherwise, ``'shares'`` token and ``NUMBER`` tag present - every number
       at most four tokens before ``'shares'`` is reported with confidence 0.7.

     Args:
         sen_id: Sentence identifier.
         sen: Tokens of the sentence.
         sen_lemma: Lemmas of the sentence tokens.
         sen_ner: NER tags of the sentence tokens.
         sen_entity: Per-token entity annotations of the sentence.
         sen_ipo_tags: Optional container of sentence ids tagged as IPO.

     Yields:
         str: ``sp``-joined ``company, amount, confidence`` records. When no
         company can be found the miss is logged and nothing is yielded.
     """
     organizations = [
         name for name in search_tag(sen, 'ORGANIZATION', sen_ner)
         if name not in financial_terms
     ]
     linked = list(search_tag(sen, 'O', sen_entity))

     def find_company(flag, **options):
         # Same company lookup for every branch; only the flag word and the
         # optional keyword arguments differ.
         return self.search_comp_with_flag(sen, sen_lemma, sen_entity,
                                           organizations, linked, flag,
                                           **options)

     if 'MONEY' in sen_ner:
         for position, amount in search_tag(sen, 'MONEY', sen_ner,
                                            index=True):
             if not amount.endswith('illion'):
                 continue
             confidence = 0.2
             company = find_company(amount, stem=False)
             if sen_ipo_tags is None:
                 confidence += 0.15
             elif sen_id in sen_ipo_tags:
                 confidence += 0.3

             if 'file' in sen_lemma and sen_lemma.index('file') > position:
                 confidence += 0.2
                 company = find_company('file')
             elif 'raise' in sen_lemma:
                 confidence += 0.1
                 lemma_text = ' '.join(sen_lemma)
                 if lemma_text.index('raise') < lemma_text.index(amount):
                     company = find_company('raise')
                 elif 'raise by' in lemma_text:
                     company = find_company('raise', dirc=1)

             if company is None:
                 self.logger.debug(
                     'Error:\t |ipo_detect\tsearch_raise_fund|\tCannot find company to match price {}.\t|{}\n'
                     .format(amount, sen_id))
                 continue
             yield sp.join([company, amount, str(confidence)])
     # Count stock share
     elif 'shares' in sen and 'NUMBER' in sen_ner:
         for position, count in search_tag(sen,
                                           'NUMBER',
                                           sen_ner,
                                           index=True):
             gap = sen.index('shares') - position
             # The number must precede 'shares' by one to four tokens.
             if not 0 < gap <= 4:
                 continue
             company = find_company(count)
             holding = count + ' shares'
             if company is None:
                 self.logger.debug(
                     'Error:\t |ipo_detect\tsearch_raise_fund|\tCannot find company to match stock share {}.\t|{}\n'
                     .format(holding, sen_id))
                 continue
             yield sp.join([company, holding, str(0.7)])
 def search_stock_price(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
     """Yield per-share price facts found in a sentence.

     Only runs when the sentence has a ``MONEY`` tag. Money spans ending in
     ``'illion'`` are ignored, as is a span that is the upper bound of a
     "between $ A and $ B" range (the lower bound reports the whole range).
     Each remaining span is handled by the first rule that applies:

     1. it opens a "between $ A and $ B" range -> report the range;
     2. the lemmas match "price|sell ... at" -> report the amount, but only
        when the flag word precedes it in the space-joined lemmas;
     3. the amount precedes "per share" in the sentence -> report it;
     4. the sentence has both 'stock' and 'price' tokens -> report it.

     Args:
         sen_id: Sentence identifier, used in log messages.
         sen: Tokens of the sentence.
         sen_lemma: Lemmas of the sentence tokens.
         sen_ner: NER tags of the sentence tokens.
         sen_entity: Per-token entity annotations of the sentence.

     Yields:
         str: ``sp``-joined ``company, price`` records. When no company is
         found for a price the miss is logged and nothing is yielded.
     """
     organizations = [
         name for name in search_tag(sen, 'ORGANIZATION', sen_ner)
         if name not in financial_terms
     ]
     linked = list(search_tag(sen, 'O', sen_entity))
     if 'MONEY' not in sen_ner:
         return

     range_pattern = re.compile('^between \$ [0-9]+ and \$ [0-9]+$')
     price_at_pattern = re.compile(
         '.*(price|sell)~\^~([a-zA-Z0-9]*~\^~){,5}at.*')
     # The two log templates differ only by the trailing newline.
     miss_with_newline = 'Error:\t |ipo_detect\tsearch_stock_price|\tCannot find company to match price {}.\t|{}\n'
     miss_plain = 'Error:\t |ipo_detect\tsearch_stock_price|\tCannot find company to match price {}.\t|{}'

     for index, money in search_tag(sen, 'MONEY', sen_ner, index=True):
         if money.endswith('illion'):
             continue
         # This amount closes a range that was reported via its lower bound.
         if range_pattern.match(' '.join(sen[index - 4:index + 2])):
             continue

         # request = (flag word, extra keyword args, reported price, log template)
         request = None
         window = ' '.join(sen[index - 1:index + 5])
         if range_pattern.match(window):
             request = (window, {'stem': False}, window, miss_with_newline)
         elif price_at_pattern.match(sp.join(sen_lemma)):
             lemma_text = ' '.join(sen_lemma)
             flag_word = 'sell'
             if 'price' in lemma_text:
                 flag_word = 'price'
             if lemma_text.index(flag_word) < lemma_text.index(money):
                 request = (flag_word, {'stem': False}, money,
                            miss_with_newline)
         else:
             sentence_text = ' '.join(sen)
             if 'per share' in sentence_text and (
                     sentence_text.index(money) <
                     sentence_text.index('per share')):
                 request = ('per', {'stem': False}, money, miss_plain)
             elif 'stock' in sen and 'price' in sen:
                 request = (sen[index], {}, money, miss_plain)

         if request is None:
             continue
         flag, options, price, miss_template = request
         company = self.search_comp_with_flag(sen, sen_lemma, sen_entity,
                                              organizations, linked, flag,
                                              **options)
         if company is None:
             self.logger.debug(miss_template.format(price, sen_id))
             continue
         yield sp.join([company, price])
    def search_ticker(self, sen_id, sen, sen_ner, stock_code):
        """Find the ticker symbol announced next to a stock-exchange mention.

        Two strategies are tried in order:

        1. Bracket form ``-LRB- <stock_code> : <SYMBOL> -RRB-``: the symbol is
           the token before the first ``-RRB-``; the company is the
           organization listed just before ``stock_code`` (confidence 0.9) or,
           failing that, the token before the first ``-LRB-`` (0.75).
        2. Nearest all-uppercase token (not a financial term) after the
           ``ticker`` / ``symbol`` keyword - or after the first token when no
           keyword exists - measured in characters from the keyword (or from
           ``stock_code`` when there is no keyword).

        Args:
            sen_id: Sentence identifier, used in log messages.
            sen: Tokens of the sentence.
            sen_ner: NER tags of the sentence tokens.
            stock_code: Stock-exchange name to anchor the search on.

        Returns:
            str: ``'<company>~^~<symbol>~^~<confidence>'`` (the company part
            is empty when it could not be determined), or ``''`` when no
            ticker was found.

        Raises:
            ValueError: If the bracket form matched but the sentence has no
                ``-RRB-`` token, or if no keyword exists and ``stock_code``
                does not occur in the space-joined sentence.
        """
        sentence_text = ' '.join(sen)
        com_candidates = list(search_tag(sen, 'ORGANIZATION', sen_ner))
        symbol_code = ''

        if '-LRB- {} :'.format(stock_code) in sentence_text:
            symbol_code = sen[sen.index('-RRB-') - 1]
            # Only a failed lookup is expected here; a bare `except` would
            # also hide unrelated errors (and KeyboardInterrupt).
            try:
                search_index = com_candidates.index(stock_code)
                if search_index > 0:
                    return '{}~^~{}~^~0.9'.format(
                        com_candidates[search_index - 1], symbol_code)
                self.logger.debug(
                    'Error\t: NER should have company before -LRB- {} : -RRB-.\t|{}\n'
                    .format(stock_code, sen_id))
                return '{}~^~{}~^~0.75'.format(sen[sen.index('-LRB-') - 1],
                                               symbol_code)
            except ValueError:
                self.logger.debug(
                    "Error: \t {} is not in the company list.\t|{}\n".format(
                        stock_code, sen_id))

        # Character offset of every token inside sentence_text, so a token is
        # located by its own position rather than by the first place its text
        # happens to occur as a substring.
        offsets = []
        cursor = 0
        for token in sen:
            offsets.append(cursor)
            cursor += len(token) + 1

        # Track "keyword found" explicitly: token index 0 is a valid keyword
        # position and must not be mistaken for "not found".
        keyword_index = None
        for keyword in ('ticker', 'symbol'):
            if keyword in sen:
                keyword_index = sen.index(keyword)
                break
        if keyword_index is None:
            start = 0
            anchor_pos = sentence_text.index(stock_code)
        else:
            start = keyword_index
            anchor_pos = offsets[keyword_index]

        upper_pattern = re.compile('^[A-Z]+$')
        found = False
        best_distance = len(sentence_text)
        for ii in range(start + 1, len(sen)):
            token = sen[ii]
            if upper_pattern.match(token) is None or token in financial_terms:
                continue
            found = True
            distance = abs(offsets[ii] - anchor_pos)
            # Strict comparison: on a tie the earliest candidate wins.
            if distance < best_distance:
                symbol_code = token
                best_distance = distance

        if found:
            try:
                search_index = com_candidates.index(stock_code)
            except ValueError:
                self.logger.debug(
                    "Error:\t |ipo_detect\tsearch_ticker|\t{} is not in the company list.\t|{}\n"
                    .format(stock_code, sen_id))
            else:
                if search_index > 0:
                    previous = com_candidates[search_index - 1]
                    if previous in financial_terms:
                        return '~^~{}~^~0.5'.format(symbol_code)
                    return '{}~^~{}~^~0.8'.format(previous, symbol_code)
                return '~^~{}~^~0.5'.format(symbol_code)

        return ''