Exemplo n.º 1
0
 def initialize(self, rules):
     """Build the two-level ESM (Aho-Corasick) lookup structures for *rules*.

     Each distinct (key_re, value_re) pattern pair gets a numeric condition
     id; key patterns go into ``self.l_index`` and value patterns into a
     per-key ``self.r_index`` entry.  ``self.rule_cond`` maps rule -> set of
     condition ids and ``self.cond_rules`` records the reverse direction.
     """
     self.l_index = esm.Index()
     conds = {}  # (key_re, value_re) -> condition id
     lpatterns = {}  # key_re -> left-index id
     for rule in rules:
         rc = set()
         for p in rule.rule.patterns:
             # source/profile patterns are handled outside this index
             if p.key_re in ("source", "^source$", "profile", "^profile$"):
                 continue
             pd = (p.key_re, p.value_re)
             cond = conds.get(pd)
             # BUG FIX: ids start at 0, which is falsy, so the previous
             # "if not cond" re-registered the first condition under a
             # fresh id on every later occurrence of the same pattern.
             if cond is None:
                 cond = len(conds)
                 conds[pd] = cond
                 self.conditions[cond] = (p.key_re, p.value_re)
                 li = lpatterns.get(p.key_re)
                 if li is None:  # same falsy-zero pitfall as above
                     li = len(lpatterns)
                     lpatterns[p.key_re] = li
                     self.enter(self.l_index, p.key_re, li)
                     self.r_index[li] = esm.Index()
                 self.enter(self.r_index[li], p.value_re, cond)
             rc.add(cond)
         self.rule_cond[rule] = rc
         for c in rc:
             self.cond_rules[c].add(rule)
     # Fix (finalize) all indexes so they can be queried
     self.l_index.fix()
     for i in self.r_index.itervalues():
         i.fix()
Exemplo n.º 2
0
def createForeignCourtsDf(folder_path=None, file_name=None):
    """Load the foreign-courts regex table and build lookup structures.

    Returns a tuple ``(country_names, court_names, regex_df)`` where the
    first two are fixed ESM indexes keyed on UTF-8 encoded names and the
    last is a DataFrame indexed by (country, court-regex) pairs.
    """
    # BUG FIX: file_name used to be unconditionally overwritten, making
    # the parameter dead; the constant is now only a default.
    if file_name is None:
        file_name = 'foreign_courts_regex_20161007.csv'
    # read the whole table inside a context manager so the handle closes
    with open(folder_path + file_name) as csv_file:
        regex_table = list(helpers.unicode_csv_reader(csv_file))

    def extract_key(v):
        return v[0]

    # groupby requires the rows to be pre-sorted on the same key
    data = sorted(regex_table, key=extract_key)
    regex_result = [[k, [x[2] for x in g]]
                    for k, g in itertools.groupby(data, extract_key)]
    country_names = esm.Index()
    court_names = esm.Index()
    for word in regex_result:
        country_names.enter(word[0].encode('utf-8'), word[0])
        for row in word[1]:
            court_names.enter(row.encode('utf-8'), [word[0], row])
    country_names.fix()
    court_names.fix()
    regex_test = dict(
        (tuple([row[0], row[2]]), row[3:]) for row in regex_table)
    regex_df = pd.Series(regex_test)
    regex_df = pd.DataFrame(regex_df, columns=['info'])
    regex_df[['citation_type', 'country_id', 'court_code',
              'court_id']] = regex_df['info'].apply(pd.Series)
    regex_df.drop('info', inplace=True, axis=1)
    return country_names, court_names, regex_df
 def __init_tld_index():
     """Build a fixed ESM index of all known TLDs, each prefixed with '.'
     and IDNA-encoded (the wire form of hostnames)."""
     tldindex = esm.Index()
     tlds = (tldextract.TLDExtract()._get_tld_extractor().tlds)
     # BUG FIX: removed an unused second Index ("ldindex") that was
     # created and immediately abandoned.
     for tld in tlds:
         tldindex.enter('.' + tld.encode('idna'))
     tldindex.fix()
     return tldindex
Exemplo n.º 4
0
 def __init__(self, d):
     """Keep *d* and prepare one empty ESM index plus one mirror set for
     each of the four sentiment/situation polarity buckets."""
     self.d = d
     for bucket in ('positive_sentiments', 'negative_sentiments',
                    'positive_situations', 'negative_situations'):
         # keyword index for this bucket ...
         setattr(self, bucket, esm.Index())
         # ... and the companion set (attribute prefixed with "s_")
         setattr(self, 's_' + bucket, set())
Exemplo n.º 5
0
 def __init__(self):
     """Set up strategy state, keyword indexes, config file paths,
     logging, and the predictor used by this class."""
     self.lstrategy = []  # loaded strategy entries
     self.zwordindex = esm.Index()  # noise-word ("zaoyin") keyword index
     self.uniomindex = esm.Index()  # China-Unicom determiner keyword index
     self.sProjectPath = os.path.dirname(os.path.realpath(__file__))
     self.sZaoyinPath = '%s/zaoyin.list' % (self.sProjectPath)
     self.unicomdetemeter = '%s/ChinaUniomDetermeter'%(self.sProjectPath)
     self.logger = logging.getLogger("intelligent")
     # presumably __loadStrategy fills the two indexes above -- verify
     self.__loadStrategy()
     self.oPR = Predict()
Exemplo n.º 6
0
    def __init__(self, province_parser):
        """Build fixed keyword indexes for phone and address detection."""
        self.content_length = 50
        self.province_parser = province_parser

        # index of phone-related keywords
        phone_idx = esm.Index()
        for kw in bid_conf.phone_keyword_list:
            phone_idx.enter(kw)
        phone_idx.fix()
        self.phone_index = phone_idx

        # index of address-related keywords
        addr_idx = esm.Index()
        for kw in bid_conf.address_keyword_list:
            addr_idx.enter(kw)
        addr_idx.fix()
        self.address_index = addr_idx
def gen_game_index(file_path):
    """Build a fixed ESM index of game names read from *file_path*.

    Each line is "<something>#@#<game1>@<game2>@...".  Both the raw game
    name and its normalized form (prepare_str) are entered when they fall
    within the length thresholds.
    """
    game_index = esm.Index()
    # count lines for the progress bar without building a throwaway list;
    # the handle is closed deterministically (the original leaked it)
    with open(file_path, "r") as fh:
        line_num = sum(1 for _ in fh)
    with tqdm.tqdm(total=line_num) as progress:
        valid_num = 0
        # open() instead of the deprecated py2 file() builtin
        with open(file_path, "r") as fh:
            for line in fh:
                progress.update(1)
                items1 = line.split('#@#')
                if len(items1) != 2:
                    continue
                game_list = items1[1].split('@')
                for game in game_list:
                    game2 = prepare_str(game, '')
                    len_threshold = 6
                    # CONSISTENCY FIX: bound raw names by max_len_threshold
                    # too, as the normalized form (and the video/novel
                    # builders) already do.
                    if len_threshold <= len(game) <= max_len_threshold:
                        game_index.enter(game)
                        valid_num += 1
                    if game2 != game and \
                            len_threshold <= len(game2) <= max_len_threshold:
                        game_index.enter(game2)
                        valid_num += 1
                if debug and valid_num >= 100000:
                    break
    print(valid_num)
    game_index.fix()
    return game_index
def gen_video_index(file_path):
    """Build a fixed ESM index of video names/serials from *file_path*.

    Expects 7 tab-separated fields per line; rarely-hit titles
    (hit_count < 100) must be longer to qualify.
    """
    video_index = esm.Index()
    # count lines for the progress bar; handle closed deterministically
    with open(file_path, "r") as fh:
        line_num = sum(1 for _ in fh)
    with tqdm.tqdm(total=line_num) as progress:
        valid_num = 0
        # open() instead of the deprecated py2 file() builtin
        with open(file_path, "r") as fh:
            for line in fh:
                progress.update(1)
                items1 = line.split('\t')
                if len(items1) != 7:
                    continue
                hit_count = int(items1[2])
                name = items1[3]
                alias_name = items1[4]
                serial = items1[5]
                alais_serial = items1[6]
                name_set = prepare_str_for_more([name, serial],
                                                [alias_name, alais_serial])
                # unpopular videos need a longer name to be indexed
                len_threshold = 6 if hit_count < 100 else 4
                for candidate in name_set:
                    if len_threshold <= len(candidate) <= max_len_threshold:
                        video_index.enter(candidate)
                        valid_num += 1
                if debug and valid_num >= 100000:
                    break
    print(valid_num)
    video_index.fix()
    return video_index
def gen_novel_index(file_path):
    """Build a fixed ESM index of novel titles from *file_path*.

    Each line is "<title>\\t<times>"; the minimum title length grows as
    the popularity ("times") drops.  Both the raw title and its
    normalized form are entered when within the length bounds.
    """
    novel_index = esm.Index()
    # count lines for the progress bar; handle closed deterministically
    with open(file_path, "r") as fh:
        line_num = sum(1 for _ in fh)
    with tqdm.tqdm(total=line_num) as progress:
        valid_num = 0
        # open() instead of the deprecated py2 file() builtin
        with open(file_path, "r") as fh:
            for line in fh:
                progress.update(1)
                items1 = line.strip().split('\t')
                if len(items1) != 2:
                    continue
                novel1 = items1[0]
                novel2 = prepare_str(novel1, '')
                times = int(items1[1])
                # stricter minimum length for less popular titles
                if times < 100:
                    len_threshold = 8
                elif times < 1000:
                    len_threshold = 6
                else:
                    len_threshold = 4
                if len_threshold <= len(novel1) <= max_len_threshold:
                    novel_index.enter(novel1)
                    valid_num += 1
                if novel1 != novel2 and \
                        len_threshold <= len(novel2) <= max_len_threshold:
                    novel_index.enter(novel2)
                    valid_num += 1
                if debug and valid_num >= 100000:
                    break
    print(valid_num)
    novel_index.fix()
    return novel_index
Exemplo n.º 10
0
    def __init__(self, in_list):
        """Index every string in *in_list* for multi-pattern matching.

        Items may be plain strings or ``(string, payload)`` tuples.  For
        a plain entry a match yields ``(encoded_str, )``; for a tuple
        entry the whole original tuple is returned as the match payload.
        """
        self._index = esm.Index()

        for item in in_list:
            if isinstance(item, tuple):
                encoded = item[0].encode(DEFAULT_ENCODING)
                self._index.enter(encoded, item)
            elif isinstance(item, basestring):
                encoded = item.encode(DEFAULT_ENCODING)
                self._index.enter(encoded, (encoded, ))
            else:
                raise ValueError(
                    'Can NOT build esm_multi_in with provided values.')

        self._index.fix()
Exemplo n.º 11
0
def filter_branch(title_all, good_type):
    """Count brand/branch keyword hits of *good_type* found in *title_all*.

    Returns ``(str_branch, str_branch_distinct)``: one "name:count" line
    per keyword (most frequent first) and a '|'-terminated concatenation
    of the names.  Implicitly returns None when the keyword file is
    missing or *title_all* is empty (pre-existing behaviour).
    """
    from collections import Counter

    file_path_branch = MEDIA_ROOT + '/taoke_data/branch/' + good_type + '.txt'
    if os.path.exists(file_path_branch) and len(title_all) > 0:

        with codecs.open(file_path_branch, 'r', 'utf8') as csvfile:
            index = esm.Index()
            for line_one in csvfile:
                index.enter(line_one.strip())
            index.fix()

            # each query hit is (span, keyword); keep only the keywords
            matches = [hit[1] for hit in index.query(title_all)]

            # Counter is O(n); the old per-key .count() was O(n^2)
            counts = sorted(Counter(matches).items(),
                            key=lambda d: d[1],
                            reverse=True)

            # build with join instead of quadratic string concatenation
            branch_lines = []
            distinct_parts = []
            for name, num in counts:
                if len(name.strip()) > 1:
                    branch_lines.append(name + ':' + str(num) + '\n')
                    distinct_parts.append(name + '|')

            return ''.join(branch_lines), ''.join(distinct_parts)
Exemplo n.º 12
0
    def __init__(self, plaintiff_conf_path, defendant_conf_path):
        """Load plaintiff/defendant regex patterns and bulletin parsing
        configuration.

        Each conf file holds one regex per line; blank lines are skipped.
        Files are read inside context managers so the handles are closed
        deterministically (the originals were leaked).
        """
        with open(plaintiff_conf_path, 'r') as fh:
            self.plaintiff_pattern_list = fh.read().split('\n')
        self.plaintiff_regex_list = []
        for plaintiff_pattern in self.plaintiff_pattern_list:
            if not plaintiff_pattern:
                continue
            self.plaintiff_regex_list.append(
                re.compile(unicode(plaintiff_pattern)))

        with open(defendant_conf_path, 'r') as fh:
            self.defendant_pattern_list = fh.read().split('\n')
        self.defendant_regex_list = []
        for defendant_pattern in self.defendant_pattern_list:
            if not defendant_pattern:
                continue
            self.defendant_regex_list.append(
                re.compile(unicode(defendant_pattern)))

        # fixed keyword index for recognising bulletin types
        self.bulletin_type_index = esm.Index()
        for bulletin_type in fygg_conf.bulletin_type_list:
            self.bulletin_type_index.enter(bulletin_type)
        self.bulletin_type_index.fix()

        self.bulletin_type_list = [
            u'其他', u'破产文书', u'公示催告', u'宣告失踪、死亡', u'公益诉讼', u'更正'
        ]
        self.norm_content_keyword = u'刊登版面'
        self.litiants_seps = [
            ',', ':', ',', ':', '。', '、', ";", ";", '\t', u'与'
        ]
        self.min_litigant_len = 2
        self.max_litigant_len = 40
        # NOTE(review): both alternatives of this pattern are identical;
        # kept byte-for-byte to preserve behaviour.
        self.case_id_regex = re.compile(u'(\d+)\S+号|(\d+)\S+号')
Exemplo n.º 13
0
def makeACTree(wordList=None):
    """Build and finalize an ESM (Aho-Corasick) index over *wordList*.

    Returns the fixed index, ready for querying.
    """
    # FIX: avoid the mutable-default-argument anti-pattern; None means
    # "no words" and behaves exactly like the old empty-list default.
    if wordList is None:
        wordList = []
    esmreIndex = esm.Index()
    for word in wordList:
        esmreIndex.enter(word)
    esmreIndex.fix()

    return esmreIndex
Exemplo n.º 14
0
    def __init__(self):
        """Set money thresholds and build the budget/money keyword indexes."""
        self.min_money = 3000
        self.max_money = 10000000000
        self.content_length = 60
        self.deal_num = 10
        # NOTE(review): the unescaped '.' matches ANY character (e.g. also
        # "12a34"); it is probably meant to be r'\d+\.\d+' but is kept
        # as-is to preserve existing matches.  Raw strings avoid invalid
        # escape-sequence warnings without changing the pattern.
        self.money_regex = re.compile(r'\d+.\d+')
        self.money_wan_regex = re.compile(r'\d+.\d+万')
        # fixed keyword index for budget phrases
        self.budget_index = esm.Index()
        for keyword in bid_conf.bid_budget_keyword_list:
            self.budget_index.enter(keyword)
        self.budget_index.fix()

        # fixed keyword index for money phrases
        self.money_index = esm.Index()
        for keyword in bid_conf.bid_money_keyword_list:
            self.money_index.enter(keyword)
        self.money_index.fix()
Exemplo n.º 15
0
 def __init__(self, rules):
     """Initialise the base lookup with *rules* ordered by preference,
     then build the ESM index structures via initialize()."""
     ordered = sorted(rules, key=lambda r: r.preference)
     super(XRuleLookup, self).__init__(ordered)
     self.index = esm.Index()
     self.kwmask = None
     self.rule_masks = []
     self.initialize(rules)
Exemplo n.º 16
0
def init_white_host_engine():
    """Populate the global ``white_engine`` ESM index with whitelisted
    hosts pulled from the adp database.

    NOTE(review): database credentials are hard-coded below -- they
    should come from configuration/secret management, not source code.
    """
    global white_engine
    white_engine = esm.Index()
    conn = connect(host="180.96.26.186",
                   port=33966,
                   user="******",
                   passwd="jshb114@nj",
                   db="adp")
    sql = "select a.usertags,a.host_set_object,a.plan_id from adp_group_info as a,adp_plan_info as b where a.plan_id=b.plan_id and a.enable =1 and b.enable=1 and a.mobile=2;"
    cursor = conn.cursor()
    cursor.execute(sql)
    res = cursor.fetchall()
    for it in res:
        usertags = it[0]  # fetched but unused in this function
        json_host = json.loads(it[1])
        host_list = json_host["_include_host"]
        for host in host_list:
            if len(host) > 4:
                # strip wildcard prefixes: "*.foo" -> ".foo", "*foo" -> "foo"
                if host.startswith("*."):
                    host = host[1:]
                elif host.startswith("*"):
                    host = host[1:]
                # strip wildcard / trailing-slash suffixes
                if host.endswith("/*"):
                    host = host[0:-2]
                elif host.endswith("*"):
                    host = host[0:-1]
                elif host.endswith("/"):
                    host = host[0:-1]
                white_engine.enter(host)
            if len(host.split(".")) > 3:
                if host.startswith('.'):
                    # NOTE(review): ``host_pattern`` is not defined in this
                    # function -- presumably a module-level set; verify.
                    host_pattern.add(host[1:])

    white_engine.fix()
    conn.close()
Exemplo n.º 17
0
def readXML():
    """Dictionary-based maximum matching over the training corpus.

    Builds an ESM index from the knowledge base returned by readKB(),
    runs max_match_cut over every (tag-stripped) line of train.txt and
    writes the segmented result to train2.txt.

    NOTE(review): the absolute, user-specific corpus paths should be
    parameters or configuration.
    """
    word_list = readKB()

    print('获取字典树trie')
    dic = esm.Index()
    # enter every KB word lower-cased (matching is done in lower case)
    for word in word_list:
        dic.enter(word.lower())
    dic.fix()

    print('最大匹配')
    results = []
    with open(
            '/Users/ningshixian/PycharmProjects/keras_bc6_track1/sample/data/BIBIO/train/train.txt'
    ) as f:
        lines = f.readlines()
    for line in tqdm(lines):
        # drop annotation tags before matching
        for tag in tag_list:
            line = line.replace(tag, '')
        results.append(max_match_cut(line, dic))

    with open(
            '/Users/ningshixian/PycharmProjects/keras_bc6_track1/sample/data/BIBIO/train/train2.txt',
            'w') as f:
        for sentence in results:
            f.write(sentence)
Exemplo n.º 18
0
def esm_search_file(file_name, keywords):
    """Find matches for keywords in a file.

    Prints timing stamps plus every match found per line.
    Returns -1 when *keywords* is empty (kept for existing callers),
    None otherwise.
    """
    print(datetime.now())
    if len(keywords) == 0:
        print("keywords number is zero.")
        return -1
    index = esm.Index()
    # iterate the keywords directly instead of range(len(...))
    for keyword in keywords:
        index.enter(keyword)
    index.fix()

    with open(file_name, "r") as read_fd:
        for line in read_fd:
            line = line.strip()
            if len(line) == 0:
                print("skip empty line")
                continue
            print("{0} has length {1}".format(line, len(line)))
            result = index.query(line)
            if len(result) == 0:
                print("find no match in {}".format(line))
                continue
            print("find {0} match in {1}".format(len(result), line))
            # each match is ((start, end_exclusive), keyword)
            for i, match in enumerate(result):
                print("index:{0}, find:{1}".format(i, match))
                print("from {0} to {1} match {2}".format(
                    match[0][0], match[0][1] - 1, match[1]))
    print(datetime.now())
Exemplo n.º 19
0
def get_file_list(dir_name, filters_list):
    """Collect files under *dir_name* whose names hit every filter.

    A file is kept when the number of query hits equals the number of
    filters (or *filters_list* is empty, in which case every file is
    kept).  When *dir_name* is a plain file it is returned iff it
    produces at least one hit.  Returns None for an empty/None dir_name.
    """
    if dir_name is None or 0 == len(dir_name):
        return None

    index = esm.Index()
    # iterate the filters directly instead of range(len(...))
    for filter_word in filters_list:
        index.enter(filter_word)

    index.fix()
    files = []
    if os.path.isdir(dir_name):
        # os.walk yields (parent directory, sub-directories, file names)
        for parent, dirnames, filenames in os.walk(dir_name):
            for filename in filenames:
                result = index.query(filename)
                if (len(filters_list) == 0) or (len(result)
                                                == len(filters_list)):
                    files.append(os.path.join(parent, filename))
    else:
        if len(index.query(dir_name)) != 0:
            files.append(dir_name)

    return files
Exemplo n.º 20
0
 def __init__(self, keyword, infile):
     """Load spreadsheet rows from *infile* and build the stop-word index.

     ``noneed_word`` is presumably a module-level stop-word list -- verify.
     """
     self.keyword = keyword
     self.lLines = XLSDeal().XlsToList(infile)  # rows parsed from the xls file
     self.esmins = esm.Index()  # index of words marking unwanted texts
     self.dup_list = []  # de-duplication bookkeeping, filled later
     for word in noneed_word:
         self.esmins.enter(word.strip())
     self.esmins.fix()
Exemplo n.º 21
0
 def set_limit_ip_words(self):
     """Build one fixed ESM keyword index per IP type from
     ``self.ip_words_dict`` and store it in ``self.limit_index_dict``
     keyed by that type."""
     for ip_type, names in self.ip_words_dict.items():
         idx = esm.Index()
         for name in names:
             idx.enter(name)
         idx.fix()
         self.limit_index_dict[ip_type] = idx
Exemplo n.º 22
0
 def __init__(self, date_parser):
     """Store parsing limits and build the fixed "bid close" keyword index."""
     self.content_length = 50
     self.deal_num = 10
     self.date_parser = date_parser
     # index of keywords announcing a bid close date
     idx = esm.Index()
     for kw in bid_conf.bid_close_keyword_list:
         idx.enter(kw)
     idx.fix()
     self.close_index = idx
Exemplo n.º 23
0
 def __init__(self, keys):
     """Index every string in *keys* into a fixed ESM automaton,
     exposed as ``self.A``."""
     import esm
     self.keys = keys
     automaton = esm.Index()
     for key in keys:
         automaton.enter(key)
     automaton.fix()
     self.A = automaton
Exemplo n.º 24
0
    def __init__(self, parser_tool, log):
        """Hold parsing configuration for court-session (ktgg) bulletins:
        length limits, separators, extraction regexes, keyword indexes
        and the simple-name -> full court-name mapping."""
        self.parser_tool = parser_tool
        self.log = log
        self.court_place_len = 20
        self.max_court_len = 20
        self.min_court_len = 5
        self.min_litigant_len = 2
        self.max_litigant_len = 40
        self.strip_list = ['\t\r\n', '\r\n', '\n\n', '\r', '\n']
        self.seps = ['\r', '\n', '。', ',']
        self.litiants_seps = [
            ',', ':', ',', ':', '。', '、', u'与', u'和', u'及', ";", ";", '\t', ' '
        ]

        # compile all configured litigant patterns up front
        self.litigant_regex_list = []
        for litigant_pattern in ktgg_conf.litigant_pattern_list:
            self.litigant_regex_list.append(re.compile(litigant_pattern))

        self.court_place_regex = re.compile(u'在(\S+庭)|(第\S+庭)')
        # NOTE(review): the [...] below is a character CLASS, matching any
        # single one of these characters rather than the whole role words --
        # probably intended as an alternation; kept as-is.
        self.judge_regex = re.compile(u'[合议庭成员,承办人,审判长]:(\S+)')
        self.court_time_regex = re.compile(
            u'\d+年\d+月\d+日.*?\d{1,2}[::]\d{1,2}|\d+年\d+月\d+日.*?\S+[时点分]|\d+月\d+日.*?\S+[时点分]|\d+月\d+日.*?\d{1,2}:\d{1,2}|\d+年\d+月\d+日|二[〇0O○]\S+年\S+月\S+日\d{1,2}:\d{1,2}|二[〇0O○]\S+年\S+月\S+日\S+[时点分]'
        )
        self.court_regex = re.compile(u'在(\S+人民法院)')

        # fixed keyword indexes for locating plaintiff/defendant sections
        self.plaintiff_index = esm.Index()
        for keyword in ktgg_conf.plaintiff_keyword_list:
            self.plaintiff_index.enter(keyword)
        self.plaintiff_index.fix()

        self.defendant_index = esm.Index()
        for keyword in ktgg_conf.defendant_keyword_list:
            self.defendant_index.enter(keyword)
        self.defendant_index.fix()

        # locate the project root by trimming at "i_entity_extractor"
        self.current_path = os.getcwd()
        self.basic_path = self.current_path[:self.current_path.
                                            rfind("i_entity_extractor")]
        self.config_path = self.basic_path + "i_entity_extractor/extractors/ktgg/simple2court_kv.conf"
        # "simple name,full name" lines -> self.court_kv mapping
        self.court_list = open(self.config_path).read().split('\n')
        self.court_kv = {}
        for court in self.court_list:
            tmp_list = court.split(',')
            if len(tmp_list) != 2:
                continue
            self.court_kv[unicode(tmp_list[0])] = unicode(tmp_list[1])
Exemplo n.º 25
0
def init_black_host_engine():
    """Populate the global ``black_engine`` ESM index from the
    newline-separated "black_host" file in the working directory."""
    global black_engine
    black_engine = esm.Index()
    with open("black_host", "r") as host_file:
        for raw_line in host_file:
            black_engine.enter(raw_line.strip())
    black_engine.fix()
Exemplo n.º 26
0
    def __init__(self):

        """Resolve the feature directory next to this module and load
        the brand features."""
        self.s_feat_dir = '%s/feature' % os.path.dirname(
            os.path.abspath(__file__))
        self.d_sep = {}  # separator lookup -- presumably filled by __load_feat; verify
        self.e_brand = esm.Index()  # brand keyword index
        self.question = ''
        self.__load_feat()
Exemplo n.º 27
0
 def _build_cluster_models(self):
     """Build one fixed ESM index per cluster in ``self.motives_db``
     and collect them (in iteration order) in ``self.cluster_models``."""
     self.cluster_models = []
     for cluster_id in self.motives_db:
         model = esm.Index()
         # each stored entry is a (count, motif) pair; only the motif
         # string is indexed
         for _count, motif in self.motives_db[cluster_id]:
             model.enter(motif)
         model.fix()
         self.cluster_models.append(model)
Exemplo n.º 28
0
 def __init__(self, raw_data, is_marketing=True, is_dup=True):
     """Prepare filtering state for *raw_data*.

     is_marketing / is_dup toggle marketing filtering and de-duplication
     in later processing.  ``noneed_word`` is presumably a module-level
     stop-word list -- verify.
     """
     self.raw_data = raw_data
     self.esmins = esm.Index()  # index of words marking unwanted texts
     self.dup_list = []  # de-duplication bookkeeping, filled later
     self.is_market = is_marketing
     self.is_dup = is_dup
     for word in noneed_word:
         self.esmins.enter(word.strip())
     self.esmins.fix()
Exemplo n.º 29
0
 def __init__(self, keyword, day=30):
     """Fetch up to *day* days of results for *keyword* and build the
     stop-word index used for filtering."""
     self.keyword = keyword
     self.raw_data = es_query(keyword, day)  # query the search backend
     self.esmins = esm.Index()  # index of words marking unwanted texts
     self.extract = ExtractShortText()
     self.dup_list = []  # de-duplication bookkeeping, filled later
     for word in noneed_word:
         self.esmins.enter(word.strip())
     self.esmins.fix()
Exemplo n.º 30
0
 def __init__(self,court_place_conf_path):
     """Build a fixed ESM index of court places, one per line of the
     conf file (UTF-8 encoded, blank lines skipped).

     NOTE(review): the file handle from open() is never closed explicitly;
     it relies on refcounting/GC.
     """
     court_place_list = open(court_place_conf_path).read().split('\n')
     self.court_place_index = esm.Index()
     for court_place in court_place_list:
         court_place = toolsutil.utf8_encode(court_place).strip()
         if not court_place:
             continue
         self.court_place_index.enter(court_place)
     self.court_place_index.fix()