Exemplo n.º 1
0
    def get_data(self, company_id):
        table_match_dict = robj.processdelta(company_id)
        deal_id = tuple(company_id.split('_'))
        import read_slt_info as read_slt_info
        upObj = read_slt_info.update(deal_id)
        docname_docid_lst = upObj.get_documentName_id()
        ph_doc_info_dict = {}
        doc_ph_info = {}
        for (doc_name, doc_id) in docname_docid_lst:
            doc_sp_lst = doc_name.split('_')
            ph = ''
            if len(doc_sp_lst) == 2:
                ph = doc_sp_lst[-1]
            if len(doc_sp_lst) == 4:
                ph = doc_sp_lst[-2] + doc_sp_lst[-1]
            ph = ph.replace('AR', 'FY')
            ph_doc_info_dict[ph] = (ph, doc_name, doc_id)
            doc_ph_info[doc_id] = ph

        sorted_ph_lst = report_year_sort.year_sort(ph_doc_info_dict.keys())
        len_phs = len(sorted_ph_lst)
        doc_sorted_lst = [ph_doc_info_dict[ph][-1] for ph in sorted_ph_lst]
        #print sorted_ph_lst
        #print doc_sorted_lst
        #sys.exit()
        new_table_match_dict = {}
        for doc_pair, pair_ars in table_match_dict.items():
            i1 = doc_sorted_lst.index(doc_pair[0])
            i2 = doc_sorted_lst.index(doc_pair[1])
            print 'doc_pair: ', doc_pair
            if (i1 < i2):
                new_table_match_dict[doc_pair] = pair_ars[:]
            else:
                print 'rev: ', doc_pair, ' == ', i1, i2
                new_pair_ars = []
                for pair in pair_ars:
                    new_pair_ars.append((pair[1], pair[0]))
                new_table_match_dict[(doc_pair[1],
                                      doc_pair[0])] = new_pair_ars[:]
        fname = os.path.join(self.output_path, company_id + '_table_delta.slv')
        sh = shelve.open(fname, 'n')
        sh['data'] = new_table_match_dict
        sh.close()
        return
Exemplo n.º 2
0
    def get_doc_pairs(self, company_id):
        """Return document-id pairs linking each period to its next sibling.

        Documents are ordered with ``report_year_sort.year_sort`` on their
        period labels.  Every label is paired with the first later label
        that starts with the same two characters; labels with no such
        successor produce no pair.  As a side effect the directory
        ``<output_path>/<company_id>`` is created when it does not exist.

        Args:
            company_id: '_'-separated identifier; its parts are passed as a
                tuple to ``read_slt_info.update``.

        Returns:
            The keys of a dict whose entries are ``(doc_id, later_doc_id)``
            tuples.
        """
        deal_id = tuple(company_id.split('_'))
        import read_slt_info as read_slt_info
        name_id_rows = read_slt_info.update(deal_id).get_documentName_id()

        # Period label -> document id; a later row with the same label wins.
        doc_id_by_label = {}
        for doc_name, doc_id in name_id_rows:
            pieces = doc_name.split('_')
            if len(pieces) == 2:
                label = pieces[1]
            elif len(pieces) == 4:
                label = pieces[2] + pieces[3]
            else:
                label = ''
            doc_id_by_label[label.replace('AR', 'FY')] = doc_id

        company_dir = os.path.join(self.output_path, company_id)
        if not os.path.exists(company_dir):
            os.mkdir(company_dir)

        ordered_labels = report_year_sort.year_sort(doc_id_by_label.keys())
        pairs = {}
        for pos, label in enumerate(ordered_labels):
            first_id = doc_id_by_label[label]
            for later_label in ordered_labels[pos + 1:]:
                second_id = doc_id_by_label[later_label]
                if later_label[:2] != label[:2]:
                    continue
                # Only the nearest later label with the same prefix is paired.
                pairs[(first_id, second_id)] = 1
                break

        return pairs.keys()
Exemplo n.º 3
0
    def _select_norm_table_id(self, project_id, url_id, norm_table_ids,
                              xml_id_groups, cache_xml_ids):
        """Pick the normalised table id that a matched table refers to.

        A single candidate is returned unconditionally.  Otherwise the first
        candidate whose xml ids (as returned by ``self.get_xml_id``) share at
        least one id with ``xml_id_groups`` is returned.

        Args:
            project_id: forwarded to ``self.get_xml_id``; part of the cache key.
            url_id: forwarded to ``self.get_xml_id``; part of the cache key.
            norm_table_ids: candidate normalised table ids.
            xml_id_groups: iterable of strings, each holding xml ids joined
                with '#'.
            cache_xml_ids: dict memoising ``self.get_xml_id`` results, keyed
                by ``(project_id, url_id, norm_table_id)``; updated in place.

        Returns:
            The selected normalised table id, or '' when no candidate matches.
        """
        if len(norm_table_ids) == 1:
            return norm_table_ids[0]

        wanted_xml_ids = set()
        for group in xml_id_groups:
            wanted_xml_ids.update(group.split('#'))

        for norm_table_id in norm_table_ids:
            cache_key = (project_id, url_id, norm_table_id)
            if cache_key not in cache_xml_ids:
                cache_xml_ids[cache_key] = self.get_xml_id(
                    project_id, url_id, norm_table_id)
            if wanted_xml_ids.intersection(cache_xml_ids[cache_key]):
                return norm_table_id
        return ''

    def get_sorted_doc_list(self, company_id):
        """Build document pairs and table chains and write them to lmdb.

        Two lmdb entries are written under ``<output_path>/<company_id>``
        (the directory is created when missing):

        * ``doc_ph_info``: document id -> period label.
        * ``doc_table_final_chain_pair``: a dict with
          ``'sorted_comb_list'`` (pairs of document ids that follow each
          other in the ``report_year_sort.year_sort`` ordering) and
          ``'doc_table_chain_pair_list'`` (chains of
          ``(doc_id, norm_table_id)`` tuples obtained by following the table
          matches read with ``self.read_delta_data``).

        Args:
            company_id: identifier of the form ``<project_id>_<url_id>``.

        Raises:
            ValueError: if ``company_id`` does not split into exactly two
                '_'-separated parts, or an 'FY' label is not followed by an
                integer.
        """
        delta_table_match_dict = self.read_delta_data(company_id)
        deal_id = tuple(company_id.split('_'))
        import read_slt_info as read_slt_info
        upObj = read_slt_info.update(deal_id)
        docname_docid_lst = upObj.get_documentName_id()

        # Period label (ph) <-> document id.  Names that do not split into
        # exactly 2 or 4 '_'-separated parts all share the label ''.
        ph_doc_id = {}
        doc_ph_info = {}
        for (doc_name, doc_id) in docname_docid_lst:
            doc_sp_lst = doc_name.split('_')
            ph = ''
            if len(doc_sp_lst) == 2:
                ph = doc_sp_lst[-1]
            elif len(doc_sp_lst) == 4:
                ph = doc_sp_lst[-2] + doc_sp_lst[-1]
            ph = ph.replace('AR', 'FY')
            ph_doc_id[ph] = doc_id
            doc_ph_info[doc_id] = ph

        lmdb_folder = os.path.join(self.output_path, company_id)
        if not os.path.exists(lmdb_folder):
            os.mkdir(lmdb_folder)

        dfname = os.path.join(lmdb_folder, 'doc_ph_info')
        self.lmdb_obj.write_to_lmdb(dfname, doc_ph_info, doc_ph_info.keys())

        sorted_ph_lst = report_year_sort.year_sort(ph_doc_id.keys())
        len_phs = len(sorted_ph_lst)
        doc_sorted_lst = [ph_doc_id[ph] for ph in sorted_ph_lst]

        # Pair every document with its successor: 'FY<n>' with 'FY<n+1>',
        # anything else with the next label in sort order, except that a
        # 'Q'/'H' label is not paired with a following 'F' label.
        # NOTE(review): the bound is len - 2, so the second-to-last period is
        # never paired with the last one.  Kept as in the original; confirm
        # whether len - 1 was intended.
        sorted_combination_doc_lst = []
        for i, ph in enumerate(sorted_ph_lst):
            if i >= len_phs - 2:
                break
            if ph[:2] == 'FY':
                next_fy = 'FY' + str(int(ph[2:]) + 1)
                if next_fy not in sorted_ph_lst:
                    continue
                next_doc_id = doc_sorted_lst[sorted_ph_lst.index(next_fy)]
            else:
                next_ph = sorted_ph_lst[i + 1]
                if ph[:1] in ('Q', 'H') and next_ph[:1] == 'F':
                    continue
                next_doc_id = doc_sorted_lst[i + 1]
            sorted_combination_doc_lst.append((doc_sorted_lst[i], next_doc_id))

        # doc id -> page number -> normalised table ids found on that page.
        project_id, url_id = company_id.split('_')
        norm_res_list = sObj.slt_normresids(project_id, url_id)
        doc_page_dict = {}
        for res_doc_id, page_number, norm_table_id in norm_res_list:
            pages = doc_page_dict.setdefault(res_doc_id, {})
            pages.setdefault(page_number, []).append(norm_table_id)

        cache_xml_ids = {}
        # (doc, table) tuples that appear as the target of some match.
        val_cons_dict = {}
        # (hyp doc, hyp table) -> list of matched (ref doc, ref table).
        doc_pair_table_pair_dict = {}
        for doc_pair in delta_table_match_dict.keys():
            hyp_doc, ref_doc = doc_pair
            # A document without any normalised table is treated like a
            # missing page: its matches are skipped instead of raising.
            hyp_pages = doc_page_dict.get(hyp_doc, {})
            ref_pages = doc_page_dict.get(ref_doc, {})
            for (hyp_list, ref_list) in delta_table_match_dict[doc_pair]:
                # The page is the last '_'-separated field of the first id.
                pg1 = hyp_list[0].split('_')[-1]
                pg2 = ref_list[0].split('_')[-1]
                if (pg1 not in hyp_pages) or (pg2 not in ref_pages):
                    continue

                selected_hyp = self._select_norm_table_id(
                    project_id, url_id, hyp_pages[pg1], hyp_list,
                    cache_xml_ids)
                selected_ref = self._select_norm_table_id(
                    project_id, url_id, ref_pages[pg2], ref_list,
                    cache_xml_ids)
                if (not selected_hyp) or (not selected_ref):
                    print 'mmmm Learning Error....'
                    continue

                hyp_key = (hyp_doc, selected_hyp)
                ref_key = (ref_doc, selected_ref)
                matched_refs = doc_pair_table_pair_dict.setdefault(hyp_key, [])
                if ref_key not in matched_refs:
                    matched_refs.append(ref_key)
                    val_cons_dict[ref_key] = 1

        # Chains start at tables that are never the target of a match and are
        # extended one step per pass until no chain end has a further match.
        chains = [[key] for key in doc_pair_table_pair_dict
                  if key not in val_cons_dict]
        extended = True
        while extended:
            extended = False
            next_chains = []
            for chain in chains:
                last_key = chain[-1]
                if last_key not in doc_pair_table_pair_dict:
                    next_chains.append(chain)
                    continue
                extended = True
                extended_pos = doc_pair_table_pair_dict[last_key]
                print ' extended_pos: ', extended_pos, len(extended_pos)
                for e in extended_pos:
                    # One new chain per possible continuation.
                    next_chains.append(chain + [e])
                    print len(chain) + 1, ' === ALL: ', len_phs
            chains = next_chains

        # Debug output only: chains listed longest first.
        ar = sorted(((len(chain), chain) for chain in chains), reverse=True)
        print 'Total ar: ', len(ar)
        for ar_elm in ar:
            print 'Len: ', ar_elm[0], 'ELMS: ', ar_elm[1]

        ofname = os.path.join(lmdb_folder, 'doc_table_final_chain_pair')
        final_pair_dict = {
            'sorted_comb_list': sorted_combination_doc_lst,
            'doc_table_chain_pair_list': chains
        }
        self.lmdb_obj.write_to_lmdb(ofname, final_pair_dict,
                                    final_pair_dict.keys())