示例#1
0
    def index_edit(self, option="question"):
        dataset_index = "zhidao_{}".format(option)
        gcounter[option] = 1
        dirname = getLocalFile("label0627/*{}*xls*".format(option))
        print dirname
        filenames = glob.glob(dirname)

        ids = set()

        for filename in filenames:
            print filename

            gcounter["files"] += 1
            ret = libfile.readExcel(
                ["category", "question", "answers", "type"],
                filename,
                start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        qa = u"{}{}".format(item["question"], item["answers"])
                        item["id"] = es_api.gen_es_id(qa)
                        if item["id"] in ids:
                            gcounter["items_skip_dup"] += 1
                            continue

                        if not item["type"] in [1, "1"]:
                            gcounter["items_skip_drop"] += 1
                            continue

                        skip_words = self.api_nlp.detect_skip_words(
                            qa,
                            check_list=["skip_words_all", "skip_words_zhidao"])
                        if skip_words:
                            print "SKIP", u"/".join(
                                skip_words), "\t---\t", item[
                                    "question"], "\t---\t", item["answers"]
                            gcounter["items_skip_minganci"] += 1
                            continue

                        ids.add(item["id"])
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        self.upload(dataset_index, item_new)
        self.upload(dataset_index)
        gcounter["esdata"] = len(ids)
示例#2
0
    def index_xianer12w_test(self):
        """Upload each distinct line of ``input/chat8xianer12w.txt`` as a question.

        Every first occurrence of a line becomes one document in the
        ``xianer12w_test`` index with a fixed placeholder answer; repeated
        lines are ignored. ``gcounter["lines"]`` counts the uploaded lines.
        """
        dataset_index = "xianer12w_test"
        source_path = getLocalFile("input/chat8xianer12w.txt")

        seen_lines = set()
        for text in libfile.file2list(source_path):
            if text not in seen_lines:
                seen_lines.add(text)
                gcounter["lines"] += 1

                doc = {}
                doc["question"] = text
                doc["answers"] = u"无语"
                doc["id"] = es_api.gen_es_id(text)
                self.upload(dataset_index, doc)

        # upload() without an item -- presumably flushes whatever is still
        # buffered; confirm against upload()
        self.upload(dataset_index)
示例#3
0
def fudan_ea_to_json(entity,
                     attribute,
                     attribute_name,
                     extra_tag,
                     values,
                     category=None,
                     searchscore=None,
                     alias=None):
    """Build the index document for one (entity, attribute) pair.

    :param entity: type(entity) is unicode
    :param attribute: attribute key; combined with ``entity`` to derive the id
    :param attribute_name: stored as-is under ``attribute_name``
    :param extra_tag: additional value appended to ``tags``
    :param values: sequence of attribute values; the first one (or ``''``
        when the sequence is empty) is also stored under ``value``
    :param category: stored under ``category`` only when truthy
    :param searchscore: always stored under ``searchscore`` (may be None)
    :param alias: accepted for call compatibility; this function does not
        read it
    :return: dict with keys id, entity, entity_name, attribute,
        attribute_name, value, values, tags, searchscore and, optionally,
        category
    """
    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    entity_name = entity

    # when regdropbrackets matches, its first group becomes the display name
    # and is tagged in both cases as well
    m = regdropbrackets.match(entity)
    if m:
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())

    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))

    # entity(index: yes) used for full text retrieval, tags(not_analyzed) used for exactly match
    ret = {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        'value': values[0] if len(values) > 0 else '',
        'values': values,
        'tags': list(set(tags)),  # de-duplicated; order is not preserved
        'searchscore': searchscore,
    }
    if category:
        ret['category'] = category
    return ret
示例#4
0
    def _index_qa(self, filenames, dataset_index, filter_option=0):
        ids = set()

        for filename in filenames:
            print filename

            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers"],
                                    filename,
                                    start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        item["id"] = es_api.gen_es_id(item["question"])
                        if item["id"] in ids:
                            continue

                        label = self.filter_qa_by_label(
                            "{}".format(item["category"]),
                            item["question"],
                            item["answers"],
                            filter_option=filter_option)
                        if label:
                            print "SKIP", label, "\t---\t", item[
                                "question"], "\t---\t", item["answers"]
                            gcounter["esdata_label_{}".format(label)] += 1
                            if filter_option in [1]:
                                continue

                        ids.add(item["id"])
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]
                        self.upload(dataset_index, item_new)
        self.upload(dataset_index)
        gcounter["esdata"] = len(ids)
示例#5
0
def ea_to_json(entity, attribute, attribute_name, extra_tag, values):
    """Build the index document for one (entity, attribute) pair.

    :param entity: type(entity) is unicode
    :param attribute: attribute key; combined with ``entity`` to derive the id
    :param attribute_name: stored as-is under ``attribute_name``
    :param extra_tag: additional value appended to ``tags``
    :param values: sequence of attribute values; the first one (or ``''``
        when the sequence is empty) is also stored under ``value``
    :return: dict with keys id, entity, entity_name, attribute,
        attribute_name, value, values, tags
    """

    tags = [entity, entity.lower(), entity.upper(), extra_tag]
    alias = get_all_aliases(entity)
    if alias:
        tags.extend(alias)

    #    alias_mapping = load_alias_mapping()
    #    if entity in alias_mapping:
    #        tags.extend(alias_mapping[entity])

    entity_name = entity

    # when regdropbrackets matches, its first group becomes the display name
    # and is tagged in both cases as well
    m = regdropbrackets.match(entity)
    if m:
        entity_name = m.group(1)
        tags.append(entity_name.lower())
        tags.append(entity_name.upper())

    eid = gen_es_id('{}__{}'.format(entity.encode('utf-8'), attribute))

    # entity(index: yes) used for full text retrieval, tags(not_analyzed) used for exactly match
    return {
        'id': eid,
        'entity': entity,
        'entity_name': entity_name,
        'attribute': attribute,
        'attribute_name': attribute_name,
        # an empty `values` no longer raises IndexError; same fallback as
        # fudan_ea_to_json
        'value': values[0] if len(values) > 0 else '',
        'values': values,
        'tags': list(set(tags))  # de-duplicated; order is not preserved
    }
示例#6
0
    def init_zhidao_qa(self):
        #clean rewrite

        dataset_index_list = [
            "qa0708query",
            "qa0708question",
        ]
        for dataset_index in dataset_index_list:
            dirname = getLocalFile("raw/{}/*".format(dataset_index))
            map_items = {}
            for filename in glob.glob(dirname):

                gcounter["files"] += 1
                ret = libfile.readExcel(
                    ["category", "question", "answers", "type"],
                    filename,
                    start_row=1)
                if ret:
                    for items in ret.values():
                        for item in items:
                            gcounter["items"] += 1

                            qa = u"{}{}".format(item["question"],
                                                item["answers"])
                            item["id"] = es_api.gen_es_id(qa)
                            if item["id"] in map_items:
                                gcounter["items_skip_dup"] += 1
                                continue

                            if not item["type"] in [1, "1"]:
                                gcounter["items_skip_drop"] += 1
                                continue

                            item["answers"] = clean_answer(u"{}".format(
                                item["answers"]))
                            item["question"] = clean_question(u"{}".format(
                                item["question"]))
                            if len(item["answers"]) < 2:
                                gcounter["items_skip_empty_answer"] += 1
                                continue

                            skip_words = self.api_nlp.detect_skip_words(
                                item["answers"],
                                check_list=[
                                    "skip_words_all", "skip_words_zhidao"
                                ])
                            if skip_words:
                                print "SKIP", u"/".join(
                                    skip_words), "\t---\t", item[
                                        "question"], "\t---\t", item["answers"]
                                gcounter["items_skip_minganci"] += 1
                                continue

                            item_new = {"source": dataset_index}
                            for p in ["question", "answers", "id"]:
                                item_new[p] = item[p]
                            map_items[item_new["question"]] = item_new

            gcounter["init_from_{}".format(dataset_index)] = len(map_items)
            print len(map_items)

            filename = getLocalFile("temp/{}.xls".format(dataset_index))
            items = sorted(map_items.values(), key=lambda x: x["question"])
            libfile.writeExcel(items,
                               ["label", "question", "answers", "source"],
                               filename)
示例#7
0
    def init_xianer7w_rewrite(self):
        dataset_index = "xianer7w_rewrite"
        gcounter[dataset_index] = 1

        ids = set()

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite_map.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #collect answer mapping
        map_answers = {}
        for item in ret.values()[0]:
            if item.get("old_answers"):
                a_old = item["old_answers"].strip()
            if item.get("answers"):
                a = item["answers"].strip()

            if a and a_old:
                map_answers[a_old] = a

        print len(map_answers)

        filename = getLocalFile("raw/rewrite/xianer7w_rewrite.xlsx")
        print filename

        gcounter["files"] += 1
        ret = libfile.readExcel(["question", "old_answers", "answers"],
                                filename,
                                start_row=0)

        #use mapping
        items = []
        for item in ret.values()[0]:
            gcounter["items"] += 1
            q = item["question"]
            if item["old_answers"]:
                a = map_answers.get(item["old_answers"])
            else:
                a = ""

            if not a:
                #print "SKIP no mapping", q, item["old_answers"]
                gcounter["items_no_mapping"] += 1
                continue

            qa = q + a
            item["id"] = es_api.gen_es_id(q)
            if item["id"] in ids:
                gcounter["items_skip_dup"] += 1
                continue

            skip_words = self.api_nlp.detect_skip_words(
                qa, check_list=["skip_words_all"])
            if skip_words:
                print "SKIP", u"/".join(
                    skip_words), "\t---\t", item["question"], "\t---\t", a
                gcounter["items_skip_minganci"] += 1
                continue

            ids.add(item["id"])
            item_new = {
                "question": q,
                "answers": a,
                "id": item["id"],
            }
            items.append(item_new)

        gcounter["qa0708rewrite"] = len(ids)

        filename = getLocalFile("temp/qa0708rewrite.xls")
        libfile.writeExcel(items, ["label", "question", "answers"], filename)
示例#8
0
    def _merge_chat(self, filenames, option):
        filename_todo = getLocalFile("input/{}_todo.txt".format(option))
        print "filename_todo", filename_todo
        q_todo = set()
        if os.path.exists(filename_todo):
            q_todo = libfile.file2set(filename_todo)
            gcounter["q_todo"] = len(q_todo)
            print "filename_todo", filename_todo, len(q_todo)

        filename_skip = getLocalFile("input/{}_skip.txt".format(option))
        print "filename_skip", filename_skip
        q_skip = set()
        if os.path.exists(filename_skip):
            q_skip = libfile.file2set(filename_skip)
            gcounter["q_skip"] = len(q_skip)
            print "filename_skip", filename_skip, len(q_skip)

        data = {}
        q_all = set()
        for filename in filenames:
            #print filename
            gcounter["files"] += 1
            ret = libfile.readExcel(["category", "question", "answers"],
                                    filename,
                                    start_row=1)
            if ret:
                for items in ret.values():
                    for item in items:
                        gcounter["items"] += 1

                        q_all.add(item["question"])

                        if q_skip and item["question"] in q_skip:
                            gcounter["items_skip"] += 1
                            continue

                        item["id"] = es_api.gen_es_id(item["question"] +
                                                      item["answers"])
                        if item["id"] in data:
                            continue

                        for dataset_index in ["chat8cmu6w", "chat8xianer12w"]:
                            if dataset_index in filename:
                                gcounter["from_" + dataset_index] += 1

                        label = self.filter_qa_by_label(
                            item["category"], item["question"],
                            item["answers"])
                        if label:
                            item["label"] = label
                            #print "SKIP", label, "\t---\t", item["question"], "\t---\t", item["answers"]
                            #gcounter["_esdata_label_{}".format(label)]+=1
                        #elif not self.api_nlp.is_question_baike(item["question"]):
                        #    item["label"] = u"百科"
                        else:
                            item["label"] = u""
                        xlabel = re.sub(":.*$", "", item["label"])
                        gcounter["data_with_label_{}".format(xlabel)] += 1

                        data[item["id"]] = item
                        item_new = {}
                        for p in ["question", "answers", "id"]:
                            item_new[p] = item[p]

        gcounter["data"] = len(data)
        results = sorted(data.values(), key=lambda x: x["question"])
        print len(data), len(results)
        filename_output = getLocalFile("output/edit_{}.xls".format(option))
        libfile.writeExcel(results, ["label", "question", "answers"],
                           filename_output)

        filename_output = getLocalFile(
            "edit0623/sample1000_edit_{}.xls".format(option))
        libfile.writeExcel(libdata.items2sample(data.values(), limit=1000),
                           ["label", "question", "answers"], filename_output)

        if q_todo:
            q_todo.difference_update(q_all)
            filename_output = getLocalFile(
                "edit0623/question_miss_{}.xls".format(option))
            libfile.lines2file(sorted(list(q_todo)), filename_output)
            gcounter["q_all"] = len(q_all)
            gcounter["q_miss"] = len(q_todo)

        page_size = 2000
        max_page = len(results) / page_size + 1
        for i in range(max_page):
            filename_output = getLocalFile("edit0623/edit_{}_{}.xls".format(
                option, i))
            #print filename_output
            idx_start = i * page_size
            idx_end = min(len(results), (i + 1) * page_size)
            libfile.writeExcel(results[idx_start:idx_end],
                               ["label", "question", "answers"],
                               filename_output)
示例#9
0
    def test(self, dataset_index="chat8xianer12w", option="query"):
        filename_todo = getLocalFile("input/{}_todo.txt".format(option))
        print "filename_todo", filename_todo
        q_todo = set()
        if os.path.exists(filename_todo):
            q_todo = libfile.file2set(filename_todo)
            gcounter["q_todo"] = len(q_todo)
            print "filename_todo", filename_todo, len(q_todo)

        filename_skip = getLocalFile("input/{}_skip.txt".format(option))
        print "filename_skip", filename_skip
        q_skip = set()
        if os.path.exists(filename_skip):
            q_skip = libfile.file2set(filename_skip)
            gcounter["q_skip"] = len(q_skip)
            print "filename_skip", filename_skip, len(q_skip)

        data = {}
        q_all = set()
        dirname = getLocalFile(
            "output0623/{}*worker*json.txt".format(dataset_index))

        for filename in glob.glob(dirname):
            print filename
            gcounter["files"] += 1

            for line in libfile.file2list(filename):
                entry = json.loads(line)
                query = entry["query"]

                #print entry.keys()
                if "items_all" not in entry:
                    gcounter["selected_no_data"] += 1
                    continue
                elif len(entry["items_all"]) == 0:
                    gcounter["selected_no_item"] += 1
                    continue

                if q_skip and query in q_skip:
                    gcounter["items_skip"] += 1
                    q_all.add(query)
                    continue

                if self.api_nlp.detect_skip_words(query):
                    gcounter["selected_query_skipwords"] += 1
                    q_all.add(query)
                    continue

                items_select = self.api_nlp.select_qapair_0624(
                    query, entry["items_all"])
                if items_select:
                    gcounter["selected_yes"] += 1
                    q_all.add(query)
                else:
                    gcounter["selected_no"] += 1

                for item in items_select:
                    item["id"] = es_api.gen_es_id(item["question"] +
                                                  item["answers"])
                    if item["id"] in data:
                        continue

                    label = self.filter_qa_by_label("", item["question"],
                                                    item["answers"])
                    if label:
                        item["label"] = label
                    else:
                        item["label"] = u""
                    xlabel = re.sub(":.*$", "", item["label"])
                    gcounter["data_with_label_{}".format(xlabel)] += 1
                    gcounter["items"] += 1

                    data[item["id"]] = item
                #ret = libfile.readExcel(["category","question","answers"], filename, start_row=1)

        if q_todo:
            q_todo.difference_update(q_all)
            filename_output = getLocalFile(
                "edit0623/query_miss_{}.xls".format(option))
            libfile.lines2file(sorted(list(q_todo)), filename_output)
            gcounter["q_all"] = len(q_all)
            gcounter["q_miss"] = len(q_todo)