Exemplo n.º 1
0
    def cal_q_d_log(self, q, dc=None, d=None, clean=False, content_field='content'):
        '''
        Score a query against a document as a sum of log-probabilities.

        Every query word ``w`` contributes
        ``log((dc[w] + self.lmd * self.C[w]) / (doc_len + self.lmd))``.
        The sum starts at 0.0 (the log of an empty product), so the result is
        the logarithm of the same per-word factors multiplied together.

        :param q: the query, either a word list or a raw string; anything that
            is not a list is passed through ``basic_preprocess`` first.
        :param dc: optional mapping word -> count for the document. When it is
            None the counts are built from ``d[content_field]``.
        :param d: dict whose ``d[content_field]`` holds the document content
            (expected to be an already preprocessed word list). May be None
            when ``dc`` is given; the document length is then taken as the sum
            of the counts in ``dc``.
        :param clean: when true, ``clean_tools`` is applied to the query and,
            if the counts have to be built here, to the document content.
        :param content_field: key of the content inside ``d``.
        :return: the log-score of ``q`` given the document (float).
        :raises ValueError: if both ``dc`` and ``d`` are None.
        '''
        if not isinstance(q, list):
            q = basic_preprocess(q)

        if d is None:
            if dc is None:
                raise ValueError("cal_q_d_log needs at least one of `dc` or `d`")
        elif not isinstance(d[content_field], list):
            # NOTE(review): only element [0] of a non-list content is used;
            # presumably the content arrives wrapped in another sequence --
            # confirm against the callers.
            # The normalised content is written back into the caller's dict.
            d[content_field] = basic_preprocess(d[content_field][0])

        if clean:
            q = clean_tools(q)

        if dc is None:
            # d is guaranteed to be present here (checked above).
            if clean:
                d[content_field] = clean_tools(d[content_field])
            dc = defaultdict(int)
            for w in d[content_field]:
                dc[w] += 1.0

        if d is not None:
            doc_len = len(d[content_field])
        else:
            doc_len = sum(dc.values())
        denominator = float(doc_len + self.lmd)

        ret = 0.0
        for w in q:
            # .get() keeps plain-dict counts usable: an unseen word counts as 0.
            ret += math.log((dc.get(w, 0) + self.lmd * self.C[w]) / denominator)

        return ret
Exemplo n.º 2
0
def test_2():
        q = 'US Military Crisis Response'
        q = cut_words(q)
        d1 = sample_doc  # 'US Military Crisis Response US Military Crisis Response US Military Crisis Response'
        d1 = basic_preprocess(d1)

        d2 = {
            'score': 9.644032,
            'key': 'ebola-1bbd62fe484a96be675ab80a304f0320a742a2da67f696cde413aee99e9f9349',
            'content': d1
            }
        lm = LMDirichlet()
        dc = lm.cal_dc(d1)

        lm.load(LMDirichlet_Json)
        print 'lm.C[US]:', lm.C['US']
        p1 = lm.cal_w_d('US', d1)
        p2 = lm.cal_q_d(q, d=d2)
        p3 = lm.cal_q_d_log(q, d=d2)
        p4  = lm.cal_q_d(q, d=d2, dc=dc)


        print "p w,d:", p1, math.log(p1)
        print "p q,d:", p2
        print "p q,d:", p3
        print "p d d by dc:", p4
Exemplo n.º 3
0
def test_3():
    from data_utils import basic_preprocess
    q = 'US Military Crisis Response'
    q = cut_words(q)
    d1 = sample_doc  # 'US Military Crisis Response US Military Crisis Response US Military Crisis Response'
    q_list = basic_preprocess(q)
    d1 = basic_preprocess(d1)

    d2 = {
        'score': 9.644032,
        'key': 'ebola-1bbd62fe484a96be675ab80a304f0320a742a2da67f696cde413aee99e9f9349',
        'content': d1
    }
    lm = LMDirichlet()
    dc = lm.cal_dc(d1)

#LMDirichlet_without_stem
    lm.load(LMDirichlet_Json)
    # print 'lm.C[US]:', lm.C['US']
    # print d1

    p1 = lm.cal_w_d('US', d1)
    p2 = lm.cal_q_d(q, d=d2)
    p3 = lm.cal_q_d_log(q, d=d2)
    p4 = lm.cal_q_d(q, d=d2, dc=dc)

    print "p w,d:", p1, math.log(p1)
    print "p q,d:", p2
    print "p q,d:", p3
    print "p d d by dc:", p4

    print "===================="
    print "q list:", q_list
    lm.load(LMDirichlet_without_stem)
    # print 'lm.C[US]:', lm.C['US']
    p1 = lm.cal_w_d(q_list[0], d1)
    p2 = lm.cal_q_d(q, d=d2)
    p3 = lm.cal_q_d_log(q, d=d2)
    p4 = lm.cal_q_d(q, d=d2, dc=dc)

    print "p w,d:", p1, math.log(p1)
    print "p q,d:", p2
    print "p q,d:", p3
    print "p d d by dc:", p4
Exemplo n.º 4
0
    def cal_q_d(self, q, dc=None, d=None, clean=False, content_field='content'):
        '''
        Score a query against a document as a product of per-word factors.

        Every query word ``w`` contributes the factor
        ``(dc[w] + self.lmd * self.C[w]) / (doc_len + self.lmd)``; an empty
        query yields 1.0.

        :param q: the query as a word list; anything that is not a list is
            passed through ``basic_preprocess`` first.
        :param dc: optional mapping word -> count for the document. When it is
            None the counts are built from ``d[content_field]``.
        :param d: dict whose ``d[content_field]`` holds the document content
            (expected to be an already preprocessed word list). May be None
            when ``dc`` is given; the document length is then taken as the sum
            of the counts in ``dc``.
        :param clean: when true, ``clean_tools`` is applied to the query and,
            if the counts have to be built here, to the document content.
        :param content_field: key of the content inside ``d``.
        :return: the score of ``q`` given the document (float).
        :raises ValueError: if both ``dc`` and ``d`` are None.
        '''
        if not isinstance(q, list):
            # Diagnostics go to the debug log instead of stdout so that
            # scoring many documents does not flood the console.
            import logging
            log = logging.getLogger(__name__)
            log.debug("before process q: %s", q)
            q = basic_preprocess(q)
            log.debug("after process q: %s", q)

        if d is None:
            if dc is None:
                raise ValueError("cal_q_d needs at least one of `dc` or `d`")
        elif not isinstance(d[content_field], list):
            # NOTE(review): only element [0] of a non-list content is used;
            # presumably the content arrives wrapped in another sequence --
            # confirm against the callers.
            # The normalised content is written back into the caller's dict.
            d[content_field] = basic_preprocess(d[content_field][0])

        if clean:
            q = clean_tools(q)

        if dc is None:
            # d is guaranteed to be present here (checked above).
            if clean:
                d[content_field] = clean_tools(d[content_field])
            dc = defaultdict(int)
            for w in d[content_field]:
                dc[w] += 1.0

        if d is not None:
            doc_len = len(d[content_field])
        else:
            doc_len = sum(dc.values())
        denominator = float(doc_len + self.lmd)

        ret = 1.0
        for w in q:
            # .get() keeps plain-dict counts usable: an unseen word counts as 0.
            ret *= (dc.get(w, 0) + self.lmd * self.C[w]) / denominator

        return ret
Exemplo n.º 5
0
def deal_nytimes(file_id,
                 in_dir,
                 out_dir,
                 json_dir,
                 stem_dir,
                 stem_jsdir,
                 overwrite=False):
    """Turn one input JSON document into word/stem line files and count JSONs.

    Each ``*_dir`` argument is a format pattern with one ``{}`` placeholder
    that is filled with ``file_id`` to get the concrete path. The input JSON
    must provide ``"title"``, ``"doc_id"`` and ``"content"`` (a mapping whose
    values are text blocks).

    Four outputs are produced, each only if it is missing or ``overwrite`` is
    true: the word line file, the stem line file, and a JSON with word counts
    for each of the two.

    :return: False when nothing had to be done (all four outputs exist and
        ``overwrite`` is false), True otherwise.
    """
    def get_content(blocks):
        """Return the blocks that are prefixes of the longest block, plus it.

        Best effort: any failure (e.g. no blocks at all) is logged and an
        empty list is returned.
        """
        try:
            # max() keeps the first of several equally long blocks.
            main = max(blocks, key=len).strip()
            # NOTE(review): the longest block passes this filter itself, so
            # its text ends up in the result twice -- confirm this is wanted.
            prefixes = [b for b in blocks if main.startswith(b.strip())]
            return prefixes + [main]
        except Exception as e:
            logging.exception("[!][%s] Exception: %s", file_id, e)
            return []

    def write_line_data(file_name, key, words):
        """Write ``"<key> w1,w2,...\\n"`` as the only line of ``file_name``."""
        with codecs.open(file_name, "w", "utf-8") as fl:
            fl.write("{} ".format(key))
            fl.write(",".join(words))
            fl.write("\n")

    def write_json(file_name, obj):
        """Dump ``obj`` as JSON, closing the file so the data is flushed."""
        with codecs.open(file_name, "w", "utf-8") as fl:
            json.dump(obj, fl)

    in_file = in_dir.format(file_id)
    out_file = out_dir.format(file_id)
    json_file = json_dir.format(file_id)
    stem_file = stem_dir.format(file_id)
    stem_json = stem_jsdir.format(file_id)

    # Skip only when every output is already present; a missing JSON output
    # must still be generated even if both line files exist.
    outputs = (out_file, stem_file, json_file, stem_json)
    if not overwrite and all(exists(path) for path in outputs):
        return False
    logging.info("[#] dealing file: %s", in_file)

    with codecs.open(in_file, "r", "utf-8") as fl:
        js = json.load(fl)
    text = " ".join([js["title"]] + get_content(js["content"].values()))
    words = du.basic_preprocess(text)

    if not exists(out_file) or overwrite:
        write_line_data(out_file, js["doc_id"], words)

    stems = du.stemmer_by_porter(words)
    if not exists(stem_file) or overwrite:
        write_line_data(stem_file, js["doc_id"], stems)

    doc_dict = {"words": Counter(words), "id": file_id, "key": js["doc_id"]}
    if not exists(json_file) or overwrite:
        write_json(json_file, doc_dict)

    if not exists(stem_json) or overwrite:
        # Same record, with the counts taken over the stems instead.
        doc_dict["words"] = Counter(stems)
        write_json(stem_json, doc_dict)
    return True
Exemplo n.º 6
0
def deal_ebola(file_id,
               in_dir,
               out_dir,
               json_dir,
               stem_dir,
               stem_jsdir,
               overwrite=True):
    """Turn one input JSON document into word/stem line files and count JSONs.

    Each ``*_dir`` argument is a format pattern with one ``{}`` placeholder
    that is filled with ``file_id`` to get the concrete path. The input JSON
    must provide ``"key"`` and ``"content"``; the content is passed through
    ``parse_html`` before preprocessing.

    Four outputs are produced, each only if it is missing or ``overwrite`` is
    true: a JSON with word counts, the word line file, the stem line file and
    a JSON with stem counts.

    :return: False when nothing had to be done (all four outputs exist and
        ``overwrite`` is false), True otherwise.
    """
    def write_line_data(file_name, key, words):
        """Write ``"<key> w1,w2,...\\n"`` as the only line of ``file_name``."""
        with codecs.open(file_name, "w", "utf-8") as fl:
            fl.write("{} ".format(key))
            fl.write(",".join(words))
            fl.write("\n")

    def write_json(file_name, obj):
        """Dump ``obj`` as JSON, closing the file so the data is flushed."""
        with codecs.open(file_name, "w", "utf-8") as fl:
            json.dump(obj, fl)

    in_file = in_dir.format(file_id)
    out_file = out_dir.format(file_id)
    json_file = json_dir.format(file_id)
    stem_file = stem_dir.format(file_id)
    stem_json = stem_jsdir.format(file_id)
    logging.info("[#] deal file_id: %s", file_id)

    outputs = (out_file, stem_file, json_file, stem_json)
    if not overwrite and all(exists(path) for path in outputs):
        return False
    logging.info("[#] dealing file_id: %s", file_id)

    with codecs.open(in_file, "r", "utf-8") as fl:
        js = json.load(fl)
    text = parse_html(js["content"])
    words = du.basic_preprocess(text, length_limit=1)
    doc_dict = {"words": Counter(words), "id": file_id, "key": js["key"]}
    stem_words = du.stemmer_by_porter(words)

    if not exists(json_file) or overwrite:
        write_json(json_file, doc_dict)
    if not exists(out_file) or overwrite:
        write_line_data(out_file, js["key"], words)
    if not exists(stem_file) or overwrite:
        write_line_data(stem_file, js["key"], stem_words)
    if not exists(stem_json) or overwrite:
        # Same record, with the counts taken over the stems instead.
        doc_dict["words"] = Counter(stem_words)
        write_json(stem_json, doc_dict)
    return True