示例#1
0
def rule_check(src, tar, rule="louhao"):
    """Report whether ``src`` and ``tar`` agree under the selected ``rule``.

    Both inputs are normalised with ``utils.clr`` before matching.

    Rules:
        ``"louhao"``: the first ``myconfig.CHECK_RULE_LOUHAO`` match found in
            each string is joined into a single key; the result is True only
            when the two keys are equal and non-empty.
        ``"jieluxiang"``: the last ``myconfig.CHECK_RULE_JIEDAO`` match of
            each string is compared; the result is False when either string
            has no match.
        anything else: the rule name is printed and ``None`` is returned.
    """
    src, tar = utils.clr(src), utils.clr(tar)
    logger.debug("%s %s\n" % (src, tar))

    if rule == "louhao":
        # The pattern lives in myconfig; an earlier revision compiled it inline.
        pattern = myconfig.CHECK_RULE_LOUHAO
        src_hits = re.findall(pattern, src)
        tar_hits = re.findall(pattern, tar)
        # A findall hit may be a tuple of groups, hence the join.
        src_key = "".join(src_hits[0]) if src_hits else ""
        tar_key = "".join(tar_hits[0]) if tar_hits else ""
        print(src_key, tar_key, src, tar)
        return src_key != "" and src_key == tar_key

    if rule == "jieluxiang":
        pattern = myconfig.CHECK_RULE_JIEDAO
        src_hits = re.findall(pattern, src)
        tar_hits = re.findall(pattern, tar)
        if not src_hits or not tar_hits:
            return False
        # Only the final match on each side takes part in the comparison.
        return src_hits[-1] == tar_hits[-1]

    # Unknown rule: echo it; the function falls through and yields None.
    print(rule)
示例#2
0
def read_xlrd(filename):
    """Yield ``(data, label, k, v)`` for each row of the workbook's first sheet.

    For every row, the values of columns 14 and 10 (0-based) are cleaned with
    ``utils.clr`` and passed to ``hugry_match(matrix_build(k, v), k, v)``.
    Only the first two of the five values returned by ``hugry_match`` are
    yielded, together with the two cleaned strings.
    """
    workbook = xlrd.open_workbook(filename)
    first_sheet = workbook.sheets()[0]
    for row in first_sheet.get_rows():
        raw_key, raw_val = row[14].value, row[10].value
        key, val = utils.clr(raw_key), utils.clr(raw_val)
        # hugry_match returns five values; the last three are not used here.
        data, label, _s, _r, _c = hugry_match(matrix_build(key, val), key, val)
        yield data, label, key, val
示例#3
0
def read_txt(filename, shuffle):
    """Yield the cleaned first ``&``-separated field of each line in a file.

    The file is read fully as UTF-8 and closed before any item is produced.

    Args:
        filename: path of the UTF-8 text file to read.
        shuffle: when truthy, each yielded item comes from a line picked
            uniformly at random (``np.random.randint``) instead of the line
            at the current position. This samples *with replacement*: the
            number of items equals the number of lines, but some lines may
            repeat and others may never appear.

    Yields:
        ``utils.clr`` applied to the text before the first ``"&"`` of the
        selected line.
    """
    # Context manager ensures the handle is released even if reading fails.
    with codecs.open(filename, "r", "utf-8") as handle:
        lines = handle.readlines()

    for line in lines:
        if shuffle:
            line = lines[np.random.randint(len(lines))]
        yield utils.clr(line.split("&")[0])
示例#4
0
def init_ner_train_data(filename):
    """Append one ``"<char> O"`` line per character of each sentence.

    Sentences come from ``read_txt(filename, shuffle=True)``. Each sentence
    is cleaned again with ``utils.clr``, then written one character per line
    followed by `` O``, with an empty line after every sentence.

    NOTE(review): the output is appended to ``filename`` itself, i.e. the same
    file the sentences are read from -- confirm this is intended.
    NOTE(review): the file is written with the platform default encoding while
    ``read_txt`` reads it as UTF-8 -- confirm the two agree on the target
    platform.

    Args:
        filename: path used both as the sentence source and the append target.
    """
    sentences = read_txt(filename, shuffle=True)
    # ``with`` guarantees the handle is closed even if cleaning/writing raises.
    with open(filename, "a+") as out:
        for sent in sentences:
            sent = utils.clr(sent)
            out.writelines("%s O\n" % char for char in sent)
            out.write("\n")
示例#5
0
def xgboost_train_data_gen(cnt=myconfig.TRAIN_DATA, shuffle=True):
    """Build an ``xgb.DMatrix`` from the files under the ``dct_level`` tree.

    Every file except ``tokens.txt`` found by walking
    ``../address_gy/source/dct_file/dct_level`` is read line by line. Each
    accepted line is cleaned with ``utils.clr`` and encoded with ``trans``;
    its label is ``dct_trans(<file name up to the first '.'>, global_dct)``.

    Args:
        cnt: not used by this function.
        shuffle: not used by this function.

    Returns:
        ``(d_train, y_train)``: the ``DMatrix`` whose features are reshaped to
        rows of ``myconfig.LENTH_PADDING`` values, and the list of labels.
    """
    X_train, y_train = [], []
    for path, _, files in os.walk("../address_gy/source/dct_file/dct_level"):
        for filename in files:
            if filename == "tokens.txt":
                continue
            # The label key depends only on the file name, so compute it once.
            label_key = filename.split(".")[0]
            # ``with`` closes each data file; previously the handle leaked.
            with open(os.path.join(path, filename), "r") as handle:
                for line in handle:
                    # NOTE(review): ``k`` is not defined in this function and
                    # ``dct_trans`` is called as a function below -- this
                    # check is kept as-is but looks wrong; confirm intent.
                    if k not in dct_trans:
                        continue
                    line = utils.clr(line)
                    X_train.append(trans(line))
                    y_train.append(dct_trans(label_key, global_dct))
    features = np.array(X_train).reshape(-1, myconfig.LENTH_PADDING)
    d_train = xgb.DMatrix(features, label=np.array(y_train).reshape(-1))
    return d_train, y_train
示例#6
0
def seperate_zhengz_address(filename):
    """Convert ``filename`` into labelled pair lines in ``zhengz_train.txt``.

    Each input line has CR/LF characters, the text ``NONE`` and spaces
    removed (in that order) and is then cleaned with ``utils.clr``.

    * A line containing ``ROOT`` is split on it into two parts and written as
      ``"<part1> <part2> 0"``.
    * Any other line is buffered. When a non-``ROOT`` line arrives while two
      lines are already buffered, the buffered pair is written as
      ``"<line1> <line2> 1"`` and the buffer is cleared.

    NOTE(review): the line that triggers the flush is not buffered itself, so
    it is discarded, and a pair still buffered at end of input is never
    written -- confirm this matches the input layout.
    NOTE(review): ``zhengz_dev.txt`` is created/truncated but never written.

    Args:
        filename: path of the raw input file.

    Raises:
        ValueError: if a line does not split into exactly two parts on
            ``ROOT``.
    """
    # One ``with`` keeps the original open order (train, dev, input) and
    # closes all three files even when a line fails to parse.
    with open("/home/dell/data/zhengz_train.txt", "w+") as rt, \
            open("/home/dell/data/zhengz_dev.txt", "w+") as wx, \
            open(filename) as f:
        pending = []
        for line in f.readlines():
            # Sequential substitutions: order matters, so they are not merged.
            line = re.sub("[\r\n]", "", line)
            line = re.sub("NONE", "", line)
            line = re.sub(" ", "", line)
            line = utils.clr(line)
            if 'ROOT' in line:
                qua, ans = line.split('ROOT')
                rt.write("%s %s 0\n" % (qua, ans))
            elif len(pending) == 2:
                rt.write("%s %s 1\n" % (pending[0], pending[1]))
                pending = []
            else:
                pending.append(line)