Example #1
def process_washington_post(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            obj = json.loads(line)
            obj['kicker'] = filter_kicker(obj)
            if obj['kicker'] is False:
                continue
            obj['body'] = extract_body([obj['contents']])

            # to lower case
            obj['title'] = str(obj['title']).lower()
            obj['body'] = str(obj['body']).lower()

            # stemming
            w_list = cfg.word_cut(obj['body'])
            for i in range(len(w_list)):
                if w_list[i].isalpha():
                    w_list[i] = stemmer.stem(w_list[i])
            obj['body'] = ' '.join(w_list)
            w_list = cfg.word_cut(obj['title'])
            for i in range(len(w_list)):
                if w_list[i].isalpha():
                    w_list[i] = stemmer.stem(w_list[i])
            obj['title'] = ' '.join(w_list)

            del obj['contents']
            obj['title_body'] = (str(obj['title']) + ' ' +
                                 str(obj['body'])).lower()
            obj['title_author_date'] = (str(obj['title']) + ' ' +
                                        str(obj['author']) + ' ' +
                                        str(obj['published_date'])).lower()
            doc = json.dumps(obj)
            # insert data
            res = es.index(index=INDEX_NAME, id=obj['id'], body=doc)
Example #2
def words_index_single(line, filter_kicker):
	obj = json.loads(line)
	doc_id = obj['id']
	contents = obj['contents']
	doc = ""
	for li in contents:
		if type(li).__name__ == 'dict':
			if 'type' in li and li['type'] == 'kicker':
				# skip filter kickers
				if li['content'] in filter_kicker.keys():
					return ()
			if 'subtype' in li and li['subtype'] == 'paragraph':
				paragraph = li['content'].strip()
				# Replace <.*?> with ""
				paragraph = re.sub(r'<.*?>', '', paragraph)
				doc += ' ' + paragraph
	doc = doc.strip()
	w_list = cfg.word_cut(doc)
	w_list = set(w_list)
	res = []
	for w in w_list:
		ds = set()
		ds.add(doc_id)
		res.append((w, ds))
	return res
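
The per-line output of Example #2 is a list of (word, {doc_id}) pairs, or an empty tuple when the document's kicker is filtered out. Below is a hedged sketch of how such postings could be merged into a full inverted index with Spark; the driver code is not among these excerpts, and the file path and the empty filter_kicker dict are placeholders.

# Hypothetical driver sketch: merge per-line postings into one inverted index.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
inverted = sc.textFile('WashingtonPost.jl') \
    .flatMap(lambda line: words_index_single(line, filter_kicker={})) \
    .reduceByKey(lambda a, b: a | b)   # union the doc-id sets per word
# each record is now (word, set_of_doc_ids)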
Example #3
def calc_score(line, words_df, query, avgdl, flag=False):
    k1 = 1.5
    b = 0.75
    obj = line
    if not flag:
        obj = json.loads(line)
    body = extract_body([obj['contents']])
    doc_id = obj['id']
    w_list = cfg.word_cut(body)
    # calc tf for the doc
    tf = {}
    for w in w_list:
        if w in tf:
            tf[w] += 1
        else:
            tf[w] = 1
    # calc bm25 for the doc
    score = 0.0
    for w in query:
        tfi = 0
        if w in tf:
            tfi = tf[w]
        dfi = 1e-7
        if w in words_df.value:
            dfi = words_df.value[w]
        dl = len(w_list)
        N = cfg.DOCUMENT_COUNT
        score += np.log(N / dfi) * ((k1 + 1) * tfi) / (k1 * (
            (1 - b) + b * dl / avgdl) + tfi)
    return (score, doc_id)
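
calc_score in Example #3 is a BM25 variant with k1 = 1.5, b = 0.75 and idf = log(N / df); query terms missing from words_df get a tiny df of 1e-7 so the logarithm stays finite. The sketch below shows how it might be exercised outside Spark. FakeBroadcast, the sample document and the numbers are illustrative assumptions, and the call still relies on extract_body, cfg.word_cut and cfg.DOCUMENT_COUNT from the surrounding module.

# Hypothetical smoke test for calc_score (not part of the original pipeline).
class FakeBroadcast:
    """Mimics the .value attribute of the Spark broadcast used for words_df."""
    def __init__(self, value):
        self.value = value

sample_doc = {'id': 'doc-1',
              'contents': [{'subtype': 'paragraph',
                            'content': 'Federal budget cuts announced today.'}]}
words_df = FakeBroadcast({'budget': 120, 'cuts': 80})  # word -> document frequency
# flag=True means the first argument is already a parsed dict, not a JSON string
score, doc_id = calc_score(sample_doc, words_df, ['budget', 'cuts'],
                           avgdl=500.0, flag=True)
print(doc_id, score)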
Example #4
def split_body(args=None):
	body, max_length = args
	max_length = int(max_length)
	w_list = cfg.word_cut(body)
	if len(w_list) <= max_length-2:
		return body
	head_len = int((max_length - 2) / 2)
	tail_len = int(max_length - 2 - head_len)
	# keep the head and the tail of the body, separated by a space (the middle is dropped)
	return ' '.join(w_list[:head_len]) + ' ' + ' '.join(w_list[-tail_len:])
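
Example #4 caps a body at max_length tokens, reserving two positions (presumably for special tokens such as BERT's [CLS]/[SEP]; the excerpt does not say) and keeping the head and tail of the text. A tiny hedged illustration, assuming cfg.word_cut tokenizes on whitespace:

# Hypothetical usage of split_body (values chosen for illustration).
body = ' '.join(str(i) for i in range(20))   # "0 1 2 ... 19"
print(split_body([body, 10]))
# keeps the first 4 and the last 4 tokens: "0 1 2 3 16 17 18 19"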
Example #5
def process_wiki(filepath):
    # load test cases: topic_id -> [doc_id, entity dicts]
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['entities'], 'r',
              encoding='utf-8') as f:
        li = []
        mp = {}
        topic_id = ''
        for line in f:
            topic_id_tmp = re.search(r'<num>.*?</num>', line)
            if topic_id_tmp is not None:
                if len(li) > 0:
                    case_mp[topic_id] = li
                    li = []
                topic_id = topic_id_tmp.group(0)[5 + 9:-7]  # strip "<num> Number: " and trailing " </num>"
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            entity_id = re.search(r'<id>.*?</id>', line)
            if entity_id is not None:
                entity_id = entity_id.group(0)[5:-6]
                mp['id'] = entity_id
            mention = re.search(r'<mention>.*?</mention>', line)
            if mention is not None:
                mention = mention.group(0)[9:-10]
                mp['mention'] = mention.lower()
            link = re.search(r'<link>.*?</link>', line)
            if link is not None:
                link = link.group(0)[6:-7]
                mp['link'] = link.lower()
                li.append(mp)
                mp = {}
        if len(li) != 0:
            case_mp[topic_id] = li
            li = []
    # find entity wiki page
    for topic_id in case_mp:
        for entity in case_mp[topic_id][1:]:
            dsl = {"size": 100, 'query': {'match': {'inlink': entity['link']}}}
            res = es.search(index=SEARCH_NAME, body=dsl)
            print(entity['id'], len(res['hits']['hits']))
            for ri in res['hits']['hits']:
                obj = ri['_source']
                obj['inlink'] = entity['link']
                # stemming
                w_list = cfg.word_cut(obj['body'])
                for i in range(len(w_list)):
                    if w_list[i].isalpha():
                        w_list[i] = stemmer.stem(w_list[i])
                obj['body'] = ' '.join(w_list)
                doc = json.dumps(obj)
                # insert data
                res = es.index(index=INDEX_NAME, body=doc)
Example #6
def process(obj):
    obj['body'] = extract_body([obj['contents']])

    # to lower case
    obj['title'] = str(obj['title']).lower()
    obj['body'] = str(obj['body']).lower()

    # stemming
    w_list = cfg.word_cut(obj['body'])
    for i in range(len(w_list)):
        if w_list[i].isalpha():
            w_list[i] = stemmer.stem(w_list[i])
    obj['body'] = ' '.join(w_list)
    w_list = cfg.word_cut(obj['title'])
    for i in range(len(w_list)):
        if w_list[i].isalpha():
            w_list[i] = stemmer.stem(w_list[i])
    obj['title'] = ' '.join(w_list)

    del obj['contents']
    obj['title_body'] = (str(obj['title']) + ' ' + str(obj['body'])).lower()
    obj['title_author_date'] = (str(obj['title']) + ' ' + str(obj['author']) +
                                ' ' + str(obj['published_date'])).lower()
    return obj
Example #7
def tfidf_index_single(line, filter_kicker, words_mp, num):
	obj = json.loads(line)
	doc_id = obj['id']
	contents = obj['contents']
	doc = ""
	for li in contents:
		if type(li).__name__ == 'dict':
			if 'type' in li and li['type'] == 'kicker':
				# skip filter kickers
				if li['content'] in filter_kicker.keys():
					return ()
			if 'subtype' in li and li['subtype'] == 'paragraph':
				paragraph = li['content'].strip()
				# Replace <.*?> with ""
				paragraph = re.sub(r'<.*?>', '', paragraph)
				doc += ' ' + paragraph
	doc = doc.strip()
	w_list = cfg.word_cut(doc)
	num = int(num)
	# calculate term frequency for each word in the document
	tf = {}
	for w in w_list:
		if w in tf:
			tf[w] += 1
		else:
			tf[w] = 1
	# calculate idf and tf-idf for each word
	tfidf_val = {}
	for w in w_list:
		# word not in vocabulary
		if w not in words_mp:
			continue
		idf = np.log(cfg.DOCUMENT_COUNT * 1.0 / len(words_mp[w]))
		tfidf_val[w] = tf[w] * 1.0 * idf
	# sort by tf-idf and merge the inverted-file entries of the top num words
	tfidf_val = sorted(tfidf_val.items(), key=lambda d: d[1], reverse=True)
	res = set()
	for i in range(min(num, len(tfidf_val))):
		w = tfidf_val[i][0]
		res = res | set(words_mp[w])
	return doc_id + ' ' + ' '.join(res)
Example #8
def gen_sample(args=None):
    max_length = args[0]
    max_length = int(max_length)
    # read all documents, loading each line as JSON (line numbering starts from 1)
    WashingtonPost = {}
    with open(path_mp['DataPath'] + path_mp['WashingtonPost'],
              'r',
              encoding='utf-8') as f:
        for line in tqdm(f):
            obj = json.loads(line)
            doc_id = obj['id']
            WashingtonPost[doc_id] = obj
    print('WashingtonPost dataset loaded.')
    # read topics idx
    topics_mp = {}
    with open(cfg.OUTPUT + 'topics_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            topics_mp[li[0]] = set(li[1:])
    print('Topics idx loaded.')
    # read tfidf_mp
    tfidf_mp = {}
    with open(cfg.OUTPUT + 'tfidf_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            tfidf_mp[li[0]] = li[1:]
    tfidf_list = list(tfidf_mp.keys())
    print('TFIDF idx loaded.')
    # read words_mp
    words_index = {}
    with open(cfg.OUTPUT + 'words_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            words_index[li[0]] = set(li[1:])
    print('words idx loaded.')

    with open(cfg.OUTPUT + 'Dataset_BertCls.txt', 'w',
              encoding='utf-8') as out:
        for cur_id in tqdm(tfidf_list):
            obj = WashingtonPost[cur_id]
            contents = obj['contents']
            title = obj['title']
            author = obj['author']
            date = obj['published_date']
            body = ""
            topic_name = ""
            for li in contents:
                if type(li).__name__ == 'dict':
                    if 'type' in li and li[
                            'type'] == 'kicker' and topic_name == "":
                        topic_name = li['content'].strip()
                    if 'subtype' in li and li['subtype'] == 'paragraph':
                        paragraph = li['content'].strip()
                        # Replace <.*?> with ""
                        paragraph = re.sub(r'<.*?>', '', paragraph)
                        body += ' ' + paragraph
            # Recall By tf_idf
            body = body.strip()
            res_tfidf = set()
            for w in tfidf_mp[cur_id]:
                res_tfidf = res_tfidf | words_index[w]
            res_tfidf = list(res_tfidf)

            # Recall By topics
            res_topic = []
            if topic_name in topics_mp:
                res_topic = list(topics_mp[topic_name])

            # Combine recall results
            similar_doc = {}  # Filter
            cur_key = ''
            if title is not None:
                cur_key += title
            if author is not None:
                cur_key += '#' + author
            if date is not None:
                cur_key += '#' + str(date)
            similar_doc[cur_key] = 1
            res_mask = {}
            res_mask[0] = set()
            res_mask[1] = set()
            res_mask[2] = set()
            res_mask[3] = set()
            res_tfidf_mp = {}  # marks docs recalled by tf-idf, to detect overlap with topic recall
            for li in res_tfidf:
                # Filter by kicker
                if li in tfidf_mp and filter_doc(WashingtonPost[li], date,
                                                 similar_doc):
                    res_mask[2].add(li)
                    res_tfidf_mp[li] = 1
            for li in res_topic:
                # Filter by kicker
                if li in tfidf_mp and filter_doc(WashingtonPost[li], date,
                                                 similar_doc):
                    if li in res_tfidf_mp:
                        res_mask[3].add(li)
                    else:
                        res_mask[1].add(li)

            # randomly sample 100 candidate label-0 (negative) documents
            zero = np.random.randint(0, len(tfidf_mp), size=[100])
            for li in zero:
                doc_id = tfidf_list[li]
                if filter_doc(WashingtonPost[doc_id], date, similar_doc):
                    res_mask[0].add(doc_id)

            # split from body
            sen1 = split_body([body, max_length])

            # Sampling: generate one example per recall label (0-3)
            for label in res_mask.keys():
                res_mask[label] = list(res_mask[label])
                if len(res_mask[label]) <= 0:
                    continue
                idx = random.randint(0, len(res_mask[label]) - 1)
                doc_id = res_mask[label][idx]
                doc_body = extract_body([WashingtonPost[doc_id]['contents']])
                sen2 = split_body([doc_body, max_length])
                out.write(str(label) + '\t' + sen1 + '\t' + sen2 + '\n')

            # label 4: a span taken from the middle of the same body
            w_list = cfg.word_cut(body)
            st = (len(w_list) - max_length + 2) // 2
            ed = st + max_length - 2
            sen2 = ' '.join(w_list[st:ed])
            out.write(str(4) + '\t' + sen1 + '\t' + sen2 + '\n')
Example #9
def test_entity_ranking():
    # stop words
    stop_words = {}
    with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
        for w in f:
            w = w[:-1]
            stop_words[w] = 1
    print('stop words loaded.')
    # test cases: topic_id -> [doc_id, entity dicts]
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['entities19'],
              'r',
              encoding='utf-8') as f:
        li = []
        mp = {}
        topic_id = ''
        for line in f:
            topic_id_tmp = re.search(r'<num>.*?</num>', line)
            if topic_id_tmp is not None:
                if len(li) > 0:
                    case_mp[topic_id] = li
                    li = []
                topic_id = topic_id_tmp.group(0)[5 + 9:-7]  # strip "<num> Number: " and trailing " </num>"
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            entity_id = re.search(r'<id>.*?</id>', line)
            if entity_id is not None:
                entity_id = entity_id.group(0)[5:-6]
                mp['id'] = entity_id
            mention = re.search(r'<mention>.*?</mention>', line)
            if mention is not None:
                mention = mention.group(0)[9:-10]
                mp['mention'] = mention.lower()
            link = re.search(r'<link>.*?</link>', line)
            if link is not None:
                link = link.group(0)[6:-7]
                mp['link'] = link.lower()
                li.append(mp)
                mp = {}
        if len(li) != 0:
            case_mp[topic_id] = li
            li = []
    print('test case loaded.')
    with open('eresult_7191.test', 'w', encoding='utf-8') as f:
        # with open('/home/trec7/lianxiaoying/trec_eval.9.0/test/eresult.test', 'w', encoding='utf-8') as f:
        for topic_id in case_mp.keys():
            li = case_mp[topic_id]
            doc_id = li[0]
            out_doc_id = {'97b489e2-0a38-11e5-9e39-0db921c47b93': 1}
            doc = ''
            if doc_id not in out_doc_id:
                dsl = {'query': {'match': {'id': doc_id}}}
                res = es.search(index=INDEX_NAME, body=dsl)
                # print(res)
                doc = res['hits']['hits'][0]['_source']
            else:
                with open(doc_id + '.txt', 'r', encoding='utf-8') as rin:
                    for line in rin:
                        doc = json.loads(line)
                doc = process(doc)
            tmp1 = cfg.word_cut(doc['title_body'])
            tmp = []
            for w in tmp1:
                if w not in stop_words:
                    tmp.append(w)
            qr = ' '.join(tmp)
            # qr = doc['title_body']
            dsl = {
                "size": 1000,
                "timeout": "1m",
                "query": {
                    'bool': {
                        'must': {
                            'match': {
                                'body': {
                                    'query': qr,
                                    'boost': 1
                                }
                            }
                        }
                    }
                }
            }
            res = es.search(index=WIKI_INDEX, body=dsl, request_timeout=30)
            res = res['hits']['hits']
            inlink_to_rank = {}
            rank = 1
            for ri in res:
                inlink = ri['_source']['inlink']
                if inlink not in inlink_to_rank:
                    inlink_to_rank[inlink] = rank
                rank += 1
            cnt = 1
            for entity in li[1:]:
                print(entity['id'])
                out = []
                out.append(topic_id)
                out.append('Q0')
                out.append(entity['id'])
                out.append(str(cnt))
                sc = 0
                if entity['link'] in inlink_to_rank:
                    sc = 1000 - inlink_to_rank[entity['link']]
                out.append(str(sc))
                out.append('ICTNET_estem')
                ans = "\t".join(out) + "\n"
                f.write(ans)
                cnt += 1
Example #10
MAX_IDF = 0.6

stop_words = {}
with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
    for w in f:
        w = w[:-1]
        stop_words[w] = 1
print('stop words loaded.')

idf = {}
N = 0
with open('/home/trec7/lianxiaoying/data/vector_corpus.txt',
          'r',
          encoding='utf-8') as f:
    for line in tqdm(f):
        w_list = cfg.word_cut(line[:-1])
        tf = {}
        for w in w_list:
            w = w.strip()
            if w not in stop_words and len(w) > 2:
                if w in tf:
                    tf[w] += 1
                else:
                    tf[w] = 1
        for w in tf.keys():
            # count toward document frequency only if the word appears in this doc at least MIN_FREQ times
            if tf[w] >= MIN_FREQ:
                if w not in idf:
                    idf[w] = 1
                else:
                    idf[w] += 1
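
In Example #10, MAX_IDF is defined but not used in the excerpt, and N is initialized but never incremented, which suggests the snippet is truncated. A heavily hedged guess at the step that presumably follows, turning the document-frequency counts into idf values and dropping overly common words (how MAX_IDF is really applied is an assumption):

# Hypothetical continuation: assumes N was incremented once per line above.
for w in list(idf.keys()):
    if idf[w] > MAX_IDF * N:          # appears in more than 60% of documents
        del idf[w]                    # too common to be informative
    else:
        idf[w] = np.log(N * 1.0 / idf[w])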
Example #11
def calc_doc_length(line):
    obj = json.loads(line)
    body = extract_body([obj['contents']])
    w_list = cfg.word_cut(body)
    return len(w_list)
Example #12
def gen_res(args=None):
    SparkContext.getOrCreate().stop()
    conf = SparkConf().setMaster("local[*]").setAppName("bm25") \
     .set("spark.executor.memory", "10g") \
     .set("spark.driver.maxResultSize", "10g") \
     .set("spark.cores.max", 10) \
     .set("spark.executor.cores", 10) \
     .set("spark.default.parallelism", 20)
    sc = SparkContext(conf=conf)
    # stop words
    stop_words = {}
    with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
        for w in f:
            w = w[:-1]
            stop_words[w] = 1
    print('stop words loaded.')
    # words df
    words_df = sc.textFile(cfg.OUTPUT + 'words_index.txt') \
     .filter(lambda line: line != '') \
     .map(lambda line: (str(line.split(' ')[0]).lower(), len(line.split(' ')[1:]))) \
     .collectAsMap()
    words_df = sc.broadcast(words_df)
    print('words_df loaded.')
    # avgdl
    avgdl = sc.textFile(path_mp['DataPath'] + path_mp['WashingtonPost']) \
     .map(lambda line: calc_doc_length(line)).sum()
    avgdl = avgdl * 1.0 / 595037  # 595037 documents in the WashingtonPost collection
    print('avgdl loaded.')
    # WashingtonPost
    WashingtonPost = sc.textFile(path_mp['DataPath'] + path_mp['WashingtonPost']) \
     .map(lambda line: return_doc(line)).collectAsMap()
    print('WashingtonPost loaded.')
    # test case
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['topics'], 'r',
              encoding='utf-8') as f:
        li = []
        for line in f:
            topic_id = re.search(r'<num>.*?</num>', line)
            if topic_id is not None:
                topic_id = topic_id.group(0)[5 + 9:-7]
                li.append(topic_id)
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            if len(li) == 2:
                case_mp[li[1]] = li[0]
                li = []
    print('test case loaded.')
    # filter and generate result
    with open('/home/trec7/lianxiaoying/trec_eval.9.0/test/bresult.test',
              'w',
              encoding='utf-8') as f:
        for cur_id in case_mp.keys():
            topic_id = case_mp[cur_id]
            print('now is processing:', topic_id)
            obj = WashingtonPost[cur_id]
            body = extract_body([obj['contents']])
            # build the query from title + body, dropping stop words
            tmp1 = cfg.word_cut(str(obj['title'] + ' ' + body).lower())
            tmp = []
            for w in tmp1:
                if w not in stop_words:
                    tmp.append(w)
            query = tmp
            if len(tmp) > 768:
                query = tmp[:512] + tmp[-256:]
            res = bm25(sc, query, words_df, avgdl)
            # filter
            title = obj['title']
            author = obj['author']
            date = obj['published_date']
            similar_doc = {}
            cur_key = ''
            if title is not None:
                cur_key += title
            if author is not None:
                cur_key += '#' + author
            if date is not None:
                cur_key += '#' + str(date)
            similar_doc[cur_key] = 1
            for score, doc_id in res:
                doc = WashingtonPost[doc_id]
                if filter_doc(doc, date, similar_doc):
                    out = []
                    out.append(topic_id)
                    out.append('Q0')
                    out.append(doc_id)
                    out.append(str(0))
                    out.append(str(score))
                    out.append('ICTNET')
                    f.write("\t".join(out) + "\n")