示例#1
0
 def count(self, query, index, document_type):
     """Return the ``count`` field reported for ``query``.

     The query is built first, then its JSON body (``query.get_object_json()``)
     is sent with a GET request to ``/<index>/<document_type>/_count/``.
     The response body is decoded as JSON and its ``'count'`` entry returned.
     """
     query.build_query()
     endpoint = '/%s/%s/_count/' % (index, document_type)
     # Only the response body is needed; the first element is ignored.
     _, raw_body = self.__request(endpoint, 'GET', query.get_object_json())
     return json.loads(raw_body)['count']
def scrape_contribs(party, start_year, end_year=None, contribs_dir=None, get_address=True,
                    federal=True, riding=True, q_reports=False, summary=False):
    """Collect contribution records for ``party`` over a range of years.

    For each year from ``start_year`` to ``end_year`` (inclusive) a query is
    created with ``build_query`` and its results are fetched with
    ``search_contribs``; everything returned is accumulated into one list.

    Args:
        party: Party identifier; forwarded to ``build_query`` and used in the
            per-year CSV file name ``<party>.<year>.csv``.
        start_year: First year to search.
        end_year: Last year to search (inclusive). ``None`` (the default)
            searches ``start_year`` only.
        contribs_dir: Directory used to build the per-year CSV path. When
            ``None`` no CSV path is passed to ``search_contribs``.
        get_address: Forwarded unchanged to ``search_contribs``.
        federal: Run the federal-party search.
        riding: Run the riding-association search. If both ``federal`` and
            ``riding`` are false, both searches are run.
        q_reports: Forwarded unchanged to ``build_query`` and
            ``search_contribs``.
        summary: Forwarded to ``search_contribs``. When true and
            ``contribs_dir`` is given, CSV paths point into a ``summaries``
            sub-directory, which is created if missing.

    Returns:
        A list with the items returned by every ``search_contribs`` call.
    """
    # The declared default used to crash on ``end_year + 1``.
    if end_year is None:
        end_year = start_year

    session = requests.Session()
    contribs = []

    # Only redirect output when a directory was supplied; the previous
    # ``contribs_dir += '/summaries'`` raised TypeError for the default None.
    if summary and contribs_dir is not None:
        contribs_dir = os.path.join(contribs_dir, 'summaries')
        if not os.path.exists(contribs_dir):
            os.makedirs(contribs_dir)

    for year in range(start_year, end_year + 1):
        csvpath = None
        if contribs_dir is not None:
            csvpath = os.path.join(contribs_dir, '{}.{}.csv'.format(party, year))
        # NOTE(review): both searches below receive the same csvpath; confirm
        # that search_contribs appends rather than overwrites.

        # run each search if they are explicitly enabled, or both if neither are
        if federal or not riding:
            # Single-argument print(...) prints the same text on Python 2 and 3.
            print('Getting federal party contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, True, year, q_reports)
            contribs.extend(search_contribs(session, queryid, True, year, get_address,
                                            csvpath, q_reports, summary))

        if riding or not federal:
            print('Getting local riding association contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, False, year, q_reports)
            contribs.extend(search_contribs(session, queryid, False, year, get_address,
                                            csvpath, q_reports, summary))

    return contribs
示例#3
0
def scrape_contribs(party,
                    start_year,
                    end_year=None,
                    contribs_dir=None,
                    get_address=True,
                    federal=True,
                    riding=True):
    """Collect contribution records for ``party`` over a range of years.

    For each year from ``start_year`` to ``end_year`` (inclusive) a query is
    created with ``build_query`` and its results are fetched with
    ``search_contribs``; everything returned is accumulated into one list.

    Args:
        party: Party identifier; forwarded to ``build_query`` and used in the
            per-year CSV file name ``<party>.<year>.csv``.
        start_year: First year to search.
        end_year: Last year to search (inclusive). ``None`` (the default)
            searches ``start_year`` only.
        contribs_dir: Directory used to build the per-year CSV path. When
            ``None`` no CSV path is passed to ``search_contribs``.
        get_address: Forwarded unchanged to ``search_contribs``.
        federal: Run the federal-party search.
        riding: Run the riding-association search. If both ``federal`` and
            ``riding`` are false, both searches are run.

    Returns:
        A list with the items returned by every ``search_contribs`` call.
    """
    # The declared default used to crash on ``end_year + 1``.
    if end_year is None:
        end_year = start_year

    session = requests.Session()
    contribs = []

    for year in range(start_year, end_year + 1):
        csvpath = None
        if contribs_dir is not None:
            csvpath = os.path.join(contribs_dir, '{}.{}.csv'.format(party, year))

        # run each search if they are explicitly enabled, or both if neither are
        if federal or not riding:
            # Single-argument print(...) prints the same text on Python 2 and 3.
            print('Getting federal party contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, True, year)
            contribs.extend(
                search_contribs(session, queryid, True, year, get_address, csvpath))

        if riding or not federal:
            print('Getting local riding association contributions for {} in {}'.format(party, year))
            queryid = build_query(session, party, False, year)
            contribs.extend(
                search_contribs(session, queryid, False, year, get_address, csvpath))

    return contribs
示例#4
0
# Paths used by the script below. res_doc_path is the directory where query
# results are stored; the remaining paths appear to be trec_eval inputs and
# outputs -- TODO confirm, their use is not visible in this part of the file.
res_doc_path = './res'
trec_eval_path = './trec_eval/trec_eval'
res_path = './eval/res.txt'
qrels_path = './eval/qrels.txt'
eval_path = './eval/eval.txt'

# Number of related terms to return
# (k1 is passed to build_query, k2 is passed to start_query).
k1 = 5
k2 = 15

if __name__ == '__main__':
    # Script entry point: build the (expanded) queries, create a BM25 model,
    # load the inverted index into it, run the queries and store the results.
    start = time.time()
    print('开始执行')
    # Build the queries
    print('根据文件' + query_path + '构建查询并作查询扩展')
    query_list = build_query(query_path, w2v_path, vocab_path, k1)
    print('构建查询完毕')
    # BM25 model
    print("构建BM25模型")
    bm = BM25()
    print('构建BM25模型完毕')
    # Load the inverted index
    print('从' + invert_table_path + '处导入倒排表')
    bm.build(invert_table_path)
    print('导入完毕')
    # Run the queries
    print("开始查询")
    res = start_query(bm, query_list, k2)
    print('存储查询结果到目录' + res_doc_path)
    get_doc_cont(res, res_doc_path, doc_path)
    # Compute P@10
示例#5
0
 def query(self, query, index, document_type):
     """Run ``query`` against ``/<index>/<document_type>/_search/``.

     The query is built first, then ``query.get_query()`` is sent with a GET
     request. The response body is wrapped in a ``DataCollection`` and
     returned.
     """
     query.build_query()
     search_path = '/%s/%s/_search/' % (index, document_type)
     # Only the response body is needed; the first element is ignored.
     _, body = self.__request(search_path, 'GET', query.get_query())
     return DataCollection(body)
示例#6
0
 def count(self, query, index, document_type):
     """Return the ``count`` field reported for ``query``.

     Builds the query, sends its JSON body with a GET request to
     ``/<index>/<document_type>/_count/`` and returns the ``'count'`` entry
     of the JSON-decoded response body.
     """
     query.build_query()
     reply = self.__request(
         '/%s/%s/_count/' % (index, document_type),
         'GET',
         query.get_object_json(),
     )
     # The request yields a two-item result; only the body (second) is used.
     _unused, payload = reply
     decoded = json.loads(payload)
     return decoded['count']
示例#7
0
 def query(self, query, index, document_type):
     """Run ``query`` against ``/<index>/<document_type>/_search/``.

     Builds the query, sends ``query.get_query()`` with a GET request and
     returns the response body wrapped in a ``DataCollection``.
     """
     query.build_query()
     reply = self.__request(
         '/%s/%s/_search/' % (index, document_type),
         'GET',
         query.get_query(),
     )
     # The request yields a two-item result; only the body (second) is used.
     _unused, payload = reply
     return DataCollection(payload)