Example #1
def sentence_evaluate(self, sentence):
    # Strip punctuation with string_parser, segment with jieba, and drop
    # whitespace-only tokens.
    seg_list = jieba.cut(
        string_parser(punc_file='../resource/punc_file.txt',
                      string_with_punc=sentence))
    seg_list = filter(lambda x: x != " ", seg_list)
    lists = list(seg_list)
    return lists
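A minimal usage sketch of the same step, with an inline regular-expression strip standing in for the project-specific string_parser helper and its punc_file.txt (both of which are assumptions here, not shown in the example):

# Illustrative only: reproduces the tokenization above without string_parser.
import re
import jieba

def tokenize(sentence):
    # hypothetical stand-in for string_parser: replace common punctuation with spaces
    cleaned = re.sub(u"[，。！？、；：,.!?;:\\s]+", " ", sentence)
    return [token for token in jieba.cut(cleaned) if token.strip()]

print(tokenize(u"今天天气不错，适合出去散步。"))
# something like ['今天', '天气', '不错', '适合', '出去', '散步'], depending on jieba's dictionary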
Example #2
def main():
    conn = MySQLdb.connect(host=args.host,
                           user=args.user,
                           passwd=args.passwd,
                           db=args.db,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content from t_news_detail order by rand() limit 200000')
    data = cur.fetchall()
    # parse data by beautiful soup
    with open("../resource/split_words.txt", "wb") as f:
        for line in data:
            ids, content_html = line
            content = BeautifulSoup(content_html, "html.parser")
            text = content.get_text()
            try:
                lists = cut_sentence_new(text)
                for key2, value2 in enumerate(lists):
                    label = str(ids) + "_" + str(key2)
                    seg_list = jieba.cut(
                        string_parser(punc_file='../resource/punc_file.txt',
                                      string_with_punc=value2))
                    seg_list = filter(lambda x: x != " ", seg_list)
                    lists = list(seg_list)
                    # keep sentences with at least 3 tokens
                    if len(lists) >= 3:
                        vector = ",".join(lists)
                        f.write(label + ',' + vector + '\n')
            except Exception:
                continue
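For reference, each line this writes to split_words.txt is the sentence label (<article id>_<sentence index>) followed by the comma-joined tokens of that sentence, e.g. a hypothetical line 1024_0,今天,天气,不错.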
Example #3
def sentence_evaluate(sentence):
    seg_list = pseg.cut(
        string_parser(punc_file='../resource/punc_file.txt',
                      string_with_punc=sentence))
    seg_list = filter(
        lambda x: x.word != "" and x.word not in stopw and 'n' in x.flag and
        len(x.word) >= 2, seg_list)
    words = map(lambda x: x.word, seg_list)
    words = filter(lambda x: len(x) != 0, words)
    return words
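A self-contained sketch of the same noun-filtering idea, with a small hypothetical stopword set in place of the stopw list and no string_parser preprocessing:

# Illustrative only: keep nouns of length >= 2 that are not stopwords.
import jieba.posseg as pseg

stopwords = {u"我们", u"今天"}  # hypothetical inline stopword set

def noun_tokens(sentence):
    return [pair.word for pair in pseg.cut(sentence)
            if pair.word and pair.word not in stopwords
            and 'n' in pair.flag and len(pair.word) >= 2]

print(noun_tokens(u"我们今天在北京参观了博物馆"))
# likely ['北京', '博物馆'], depending on jieba's POS tagging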
Example #4
def evaluate(content, keyword):
    scope = 6
    if keyword[0] is not None:
        try:
            model_word[keyword[0]]
        except KeyError:
            # keyword not in the word2vec vocabulary: fall back to segmenting it
            keyword = list(
                jieba.cut(
                    string_parser(punc_file='../resource/punc_file.txt',
                                  string_with_punc=keyword[0])))

    # split text to sentence
    lists = cut_sentence_new(content)
    dictssentence = {key: value.strip("\n") for key, value in enumerate(lists)}
    # split sentences into words; only consider sentences with more than 6 tokens
    # keep the top 20 words by importance
    words_importance = evaluate_words(dictssentence, keyword)[:20]
    words = map(lambda (word, importances): word, words_importance)
    keywords_list = map(lambda x: word2vec_evaluate(x), words)
    keywords_list = filter(lambda x: len(x) != 0, keywords_list)
    denominator = len(keywords_list)
    agg = reduce(lambda x, y: x + y, keywords_list)
    agg_final = map(lambda x: str(x), agg / denominator)
    return agg_final
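The closing lines average the keyword vectors returned by word2vec_evaluate into one centroid vector and stringify it. A toy sketch of just that aggregation step, with made-up 3-dimensional vectors standing in for real word2vec output:

import numpy as np

keywords_list = [np.array([2.0, 4.0, 6.0]),
                 np.array([4.0, 0.0, 2.0])]
denominator = len(keywords_list)
agg = sum(keywords_list)                    # element-wise sum, same effect as the reduce()
agg_final = [str(x) for x in agg / denominator]
print(agg_final)                            # ['3.0', '2.0', '4.0']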
Example #5
def evaluate():
    args = parser.parse_args()
    scope = 6
    content = args.content
    keyword = [args.keyword]
    if keyword[0] is not None:
        try:
            model_word[keyword[0]]
        except KeyError:
            keyword = list(
                jieba.cut(
                    string_parser(punc_file='../resource/punc_file.txt',
                                  string_with_punc=keyword[0])))

    # split text to sentence
    lists = cut_sentence_new(content)
    dictssentence = {key: value.strip("\n") for key, value in enumerate(lists)}
    # split sentences into words; only consider sentences with more than 6 tokens
    # keep the top 20 words by importance
    words_importance = evaluate_words(dictssentence, keyword)[:20]
    words = map(lambda (word, importances): word, words_importance)
    # for sentence
    sentence_result_lists = evaluate_sentence(dictssentence)
    if len(sentence_result_lists) == 0:
        result_final = evaluate_sentence_tradition(dictssentence,
                                                   words_importance)
        outputString = dictssentence.get(result_final[0].ids, "")
        return (outputString, words)
    else:
        sentence = stdOut(sentence_result_lists, dictssentence, 6)
        if len(sentence) == 0:
            result_final = evaluate_sentence_tradition(dictssentence,
                                                       words_importance)
            outputString = dictssentence.get(result_final[0].ids, "")
            return (outputString, words)
        else:
            outputString = sentence[0].strs
            return (outputString, words)
Example #6
def main():
    # load data
    conn = MySQLdb.connect(host=args.host,
                           user=args.user,
                           passwd=args.passwd,
                           db=args.db,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content_html from t_crawler_obj limit ' +
                args.file[0] + ',' + args.file[1])
    data = cur.fetchall()

    # load model
    model = doc2vec.Doc2Vec.load(args.model)
    # parse data by beautiful soup
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()

    # split into sentences: dicts2 -> key: paper id, value: dicttmp -> key: sentence id, value: sentence string
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp

    # split into words: dicts3 -> key: paper id, value: dicttmp -> key: sentence id, value: token list
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            seg_list = filter(lambda x: x != " ", seg_list)
            lists = list(seg_list)
            if len(lists) >= 3:  # keep sentences with at least 3 tokens
                dicttmp[key2] = lists
        dicts3[key] = dicttmp


    # vectorization and textrank

    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: X index; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix,
                          dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                tmp2 = filter(lambda x: is_chinese(x), tmp)
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=dicts2[key][ind])):
                    continue
                j += 1
                result_str = removePrefix(dicts2[key][ind].replace(" ", ""),
                                          "”".decode("utf8"))
                result = distattr2(ind, result_str)
                lists.append(result)
                if (j >= args.top):
                    break

            # print the selected sentences in their original order
            stdOut = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(stdOut):
                print str(key) + " " + str(key3 + 1) + ": " + sentence3.strs

        except Exception:
            print("No More Qualified Sentence!")