Example #1
    def return_data_from_mongodb(self, request):

        try:
            repo_id = request.session['repo_id']
            #repo_id=1
        except Exception:
            return self.error('missing knowledge repo id')
        try:
            file_id = request.POST['file_id']
            #file_id  = 1
        except Exception:
            return self.error('missing file id')
        tmp_info = {'file_id': file_id}
        try:
            news_col = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("mongodb没有数据库或者表")

        ret_entity_map = news_col.find(tmp_info)
        ret_list = []
        for val in ret_entity_map:
            ret_list.append(val)
        category_name_list = []
        ret_category = TCategory.objects.filter(repo_id=repo_id)
        for val in ret_category:
            val_dict = model_to_dict(val)
            category_name_list.append(val_dict['category_name'])

        ret_l = {'category_name': category_name_list, 'context': ret_list}
        print(ret_l)
        return render(request, 'test1.html', context=ret_l)
Example #2
    def save_mongodb_data_to_neo4j(self, request):
        try:
            #entity_id  = request.POST['entity_id']
            entity_id = ObjectId("5eb52fc9d03fe5b0f31b6f40")
        except Exception:
            return self.error("没有收到entity_id")
        try:
            #category_id = request.POST['category_id']
            category_id = 1
        except Exception:
            return self.error("category_id not received")

        try:
            news_col = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("mongodb没有数据库或者表")
        category_val = TCategory.objects.get(id=category_id)
        category_val_dict = model_to_dict(category_val)
        category_name = category_val_dict['category_name']
        tmp_data = {'_id': entity_id}
        ret_entity = news_col.find(tmp_data)
        for val in ret_entity:
            print(category_name, val, 1)
            Neo4j().create_node_mjy_edition(category_name,val)


        ret_l = {}
        return render(request, 'test1.html', context=ret_l)
Example #3
 def get_data_source():
     """
     Return the set of movie-person douban URLs already collected.
     :return:
     """
     member_col = Mongodb(db='movies', collection='member').get_collection()
     url_set = set()
     for item in member_col.find():
         url_set.add(item["douban_url"])
     return url_set
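
Every example here obtains its collection through the same `model.mongodb.Mongodb` helper, whose source is not shown. For reference, a minimal sketch of such a wrapper over pymongo; the host and port defaults are assumptions, not taken from the original code:

import pymongo

class Mongodb:
    # Hypothetical sketch of the helper the examples rely on:
    # it simply hands out a pymongo collection handle.
    def __init__(self, db, collection, host='localhost', port=27017):
        self.client = pymongo.MongoClient(host, port)
        self.collection = self.client[db][collection]

    def get_collection(self):
        return self.collection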
Example #4
 def __init__(self,
              isheadless=False,
              ismobile=False,
              isvirtualdisplay=False,
              isloadimages=True,
              isproxy=False,
              spider_id='2'):
     Driver.__init__(self,
                     log_file_name=spider_id,
                     ismobile=ismobile,
                     isvirtualdisplay=isvirtualdisplay,
                     isheadless=isheadless,
                     isloadimages=isloadimages,
                     isproxy=isproxy)
     self.collection = Mongodb(db='knowledge',
                               collection='text').get_collection()
Example #5
 def __init__(self,
              isheadless=False,
              ismobile=False,
              isvirtualdisplay=False,
              isloadimages=True,
              isproxy=False,
              proxy_ip_from="",
              spider_id='2'):
     Driver.__init__(self,
                     log_file_name=spider_id,
                     ismobile=ismobile,
                     isvirtualdisplay=isvirtualdisplay,
                     isheadless=isheadless,
                     isloadimages=isloadimages,
                     isproxy=isproxy,
                     proxy_ip_from=proxy_ip_from)
     # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
     self.baike_col = Mongodb(db='baike',
                              collection="test1").get_collection()
Example #6
    def save_data_to_mongodb(self, request):
        # Create the category table.
        # The data itself is read out of an uploaded file.
        repo_id = request.POST['repo_id']
        #create_id = request.POST['create_id']
        file_id = request.POST['file_id']

        try:
            news_col = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("MongoDB database or collection does not exist")

        try:
            ret_file_data = TDataAcquisitionLog.objects.get(id=file_id)
        except Exception:
            return self.error("no file matches this id")

        ret_file_data_dict = model_to_dict(ret_file_data)
        file_name = ret_file_data_dict['data_source']
        path_str = ret_file_data_dict['data_access']
        try:
            data = xlrd.open_workbook(path_str + file_name)
        except Exception:
            return self.error("file not found")

        table_name = data.sheet_names()[0]
        table = data.sheet_by_name(table_name)
        list_attribute = list(table.row_values(0))
        list_json = []
        row = table.nrows
        col = table.ncols

        for i in range(1, row):
            dict_data = {}
            for j in range(0, col):
                dict_data[list_attribute[j]] = table.row_values(i)[j]
            dict_data['file_id'] = file_id
            news_col.insert_one(dict_data)

        ret_l = {'context': 'success'}

        return render(request, 'test1.html', context=ret_l)
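
A caveat on the xlrd call above: xlrd 2.0 and later only reads legacy .xls workbooks. If the uploaded files are .xlsx, the same row-to-document conversion can be written with openpyxl; a sketch under that assumption (rows_to_documents is a hypothetical helper, not part of the original code):

from openpyxl import load_workbook

def rows_to_documents(path, file_id):
    # First row holds the attribute names, as in the xlrd version above.
    ws = load_workbook(path, read_only=True).worksheets[0]
    rows = ws.iter_rows(values_only=True)
    header = next(rows)
    return [dict(zip(header, row), file_id=file_id) for row in rows]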
Example #7
 def GetStatistics(spider_id, repo_id):
     collection = Mongodb(db='knowledge',
                          collection='text').get_collection()
     count = collection.find({
         "spider_id": spider_id,
         "repo_id": repo_id
     }).count()
     # comment_count = comments_collection.find({FieldName.DATA_WEBSITE: str(project.data_website), FieldName.DATA_REGION: str(project.data_region), FieldName.DATA_SOURCE: str(project.data_source)}).count()
     # try:
     #     predict_comment_count = shops_collection.aggregate([{'$match': {FieldName.DATA_WEBSITE: str(project.data_website), FieldName.DATA_REGION: str(project.data_region), FieldName.DATA_SOURCE: str(project.data_source), FieldName.SHOP_COMMENT_NUM: {"$gt": 0}}}, {'$group': {"_id": "$%s"%FieldName.SHOP_URL, "num": {"$first": "$%s" % FieldName.SHOP_COMMENT_NUM}}}, {'$group': {"_id": None, "sum": {"$sum": "$num"}}}]).next().get('sum')
     # except Exception:
     #     predict_comment_count = 0
     curr_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
     count_today = collection.find({
         "spider_id": spider_id,
         "repo_id": repo_id,
         "value.crawl_time": {
             '$regex': curr_date
         }
     }).count()
     week_start = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time() - 7 * 24 * 3600))
     count_week = collection.find({
         "spider_id": spider_id,
         "repo_id": repo_id,
         "value.crawl_time": {
             '$gt': week_start
         }
     }).count()
     month_start = time.strftime(
         '%Y-%m-%d %H:%M:%S', time.localtime(time.time() - 30 * 24 * 3600))
     count_month = collection.find({
         "spider_id": spider_id,
         "repo_id": repo_id,
         "value.crawl_time": {
             '$gt': month_start
         }
     }).count()
     result = 'Total: %6s  Today: %6s  This week: %6s  This month: %s' % (
         count, count_today, count_week, count_month)
     return result
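
Note that Cursor.count(), used throughout GetStatistics, was deprecated in PyMongo 3.7 and removed in 4.0. On a current driver the same figures come from count_documents on the collection; a sketch of the equivalent calls for the first two counters (get_counts is a hypothetical helper):

def get_counts(collection, spider_id, repo_id, curr_date):
    # count_documents replaces the removed Cursor.count() (PyMongo >= 4).
    total = collection.count_documents({"spider_id": spider_id, "repo_id": repo_id})
    today = collection.count_documents({
        "spider_id": spider_id,
        "repo_id": repo_id,
        "value.crawl_time": {"$regex": curr_date}
    })
    return total, today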
Example #8
from model.mongodb import Mongodb
from bson import ObjectId

if __name__ == '__main__':
    test_col = Mongodb(db='test', collection='test').get_collection()
    # for i in test_col.find():
    #     if "file_id" in i:
    #         print(i["file_id"])
    #     test_col.update_one({"_id": i["_id"]}, {"$set": {"alexa": "12"}})
    test = test_col.find({"file_id": 1, "value.test1": {"$exists": True}})

    for item in test:
        print(item["_id"])
Example #9
    def extract_relationship_from_unstructured_data(
            self, request, file_id, relationship_attribute_list=None):
        """
        从非结构化数据中抽取关系
        :param file_id:文件id,获取mongodb中对应要分析的数据
        :param relationship_attribute_list:关系属性列表,所有使用该算法的关系属性id集合
        :param request:
        :return:
        """
        print("------------------------非结构关系抽取")
        tmp_info = {
            'file_id': file_id,
            'user_id': request.session["user_id"],
            'repo_id': request.session["repo_id"]
        }
        collection = Mongodb(db='knowledge',
                             collection='text').get_collection()
        ret_entity = collection.find(tmp_info)
        ret_entity_map = list()
        for item in ret_entity:
            if "内容" in item["value"]:
                ret_entity_map.append(item)

        if len(ret_entity_map) == 0 or relationship_attribute_list is None:
            print("无可抽取内容")
            return
        relationship_list = []
        # all_category = TCategory.objects.filter(repo_id=request.session["repo_id"], create_id=request.session["user_id"], category_type=1)
        added_category_id = set()
        for attribute_id in relationship_attribute_list:
            cur_attribute = TAttribute.objects.get(id=attribute_id)
            category_from = TCategory.objects.get(id=cur_attribute.category_id)
            data_type = TDataType.objects.get(id=cur_attribute.data_type_id)
            category_to = TCategory.objects.get(id=data_type.category_id)

            category_from_name = BaseController.get_category_name(
                request, category_from.category_name)
            category_to_name = BaseController.get_category_name(
                request, category_to.category_name)

            one_relationship = list()
            one_relationship.append(cur_attribute.attribute_name)
            one_relationship.append(category_from_name)
            one_relationship.append(
                BaseController.get_category_name(request,
                                                 cur_attribute.attribute_name))
            one_relationship.append(category_to_name)
            relationship_list.append(one_relationship)
            self.hanlp_tool.add_word_list([{
                "word": alia_item.attribute_alias,
                "mask": BaseController.get_category_name(
                    request, cur_attribute.attribute_name)
            } for alia_item in TAttrbuteAlias.objects.filter(
                attribute_id=cur_attribute.id)])
            if category_from.id not in added_category_id:
                ret_list_id, ret_list_val = some_data_deal_func(
                ).inputCategoryIdReturnName(
                    categoryId=category_from.id,
                    repoId=request.session["repo_id"],
                    createId=request.session["user_id"])
                self.hanlp_tool.add_word_list([{
                    "word": val_item,
                    "mask": category_from_name
                } for val_item in ret_list_val])
                added_category_id.add(category_from.id)
            if category_to.id not in added_category_id:
                ret_list_id, ret_list_val = some_data_deal_func(
                ).inputCategoryIdReturnName(
                    categoryId=category_to.id,
                    repoId=request.session["repo_id"],
                    createId=request.session["user_id"])
                self.hanlp_tool.add_word_list([{
                    "word": val_item,
                    "mask": category_to_name
                } for val_item in ret_list_val])
                added_category_id.add(category_to.id)

        # for category_item in all_category:
        #     try:
        #         one_data_type = TDataType.objects.get(category_id=category_item.id, repo_id=request.session["repo_id"], create_id=request.session["user_id"])
        #         attribute_list = TAttribute.objects.filter(data_type_id=one_data_type.id)
        #         category_to_name = BaseController.get_category_name(request, category_item.category_name)
        #         for attribute_item in attribute_list:
        #             category_from = TCategory.objects.get(id=attribute_item.category_id)
        #             category_from_name = BaseController.get_category_name(request, category_from.category_name)
        #             one_relationship = list()
        #             one_relationship.append(attribute_item.attribute_name)
        #             one_relationship.append(category_from_name)
        #             one_relationship.append(BaseController.get_category_name(request, attribute_item.attribute_name))
        #             one_relationship.append(category_to_name)
        #             relationship_list.append(one_relationship)
        #             self.hanlp_tool.add_word_list([{"word": alia_item.attribute_alias,
        #                                            "mask": BaseController.get_category_name(request,
        #                                                                                     attribute_item.attribute_name)}
        #                                           for alia_item in
        #                                           TAttrbuteAlias.objects.filter(attribute_id=attribute_item.id)])
        #             print([{"word": alia_item.attribute_alias,
        #                                            "mask": BaseController.get_category_name(request,
        #                                                                                     attribute_item.attribute_name)}
        #                                           for alia_item in
        #                                           TAttrbuteAlias.objects.filter(attribute_id=attribute_item.id)])
        #             if category_from.id not in added_category_id:
        #                 ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(categoryId=category_from.id, repoId=request.session["repo_id"], createId=request.session["user_id"])
        #                 self.hanlp_tool.add_word_list([{"word": val_item, "mask": category_from_name} for val_item in ret_list_val])
        #                 added_category_id.add(category_from.id)
        #         if category_item.id not in added_category_id:
        #             ret_list_id, ret_list_val = some_data_deal_func().inputCategoryIdReturnName(
        #                 categoryId=category_item.id, repoId=request.session["repo_id"],
        #                 createId=request.session["user_id"])
        #             self.hanlp_tool.add_word_list(
        #                 [{"word": val_item, "mask": category_to_name} for val_item in ret_list_val])
        #             added_category_id.add(category_item.id)
        #     except ObjectDoesNotExist:
        #         continue
        neo4j = Neo4j()
        count = 0
        for i in ret_entity_map:
            _id = i['_id']
            value = i['value']
            content = value['内容']
            text = HanlpUnit().get_text_from_html(content)

            sentenceList = self.hanlp_tool.split_paragraph(text)
            extract_relationship = []
            for sent in sentenceList:
                sent = sent.strip()

                relationships = self.eventExtractionByTemplateMatching(
                    sent, relationship_list)
                # relationships = self.eventExtractionByTemplateMatching(text.strip(), relationship_list)
                for item in relationships:
                    relation_id = item[0]
                    cur_relationship = relationship_list[relation_id]

                    extract_relationship.append({
                        "object_from_category": cur_relationship[1],
                        "object_to_category": cur_relationship[3],
                        "object_from_name": item[1],
                        "object_relationship_name": item[2],
                        "object_to_name": item[3]
                    })
                    object1 = neo4j.match(
                        object_from={
                            "label_name": cur_relationship[1],
                            "content": {
                                "名字": item[1]
                            }
                        })
                    object2 = neo4j.match(
                        object_from={
                            "label_name": cur_relationship[3],
                            "content": {
                                "名字": item[3]
                            }
                        })
                    if object1 is not None and len(
                            object1) == 1 and object2 is not None and len(
                                object2) == 1:
                        neo4j.createRelationship(labelOne=cur_relationship[1],
                                                 labelTwo=cur_relationship[3],
                                                 relationShipName=item[2],
                                                 propertyOne={"名字": item[1]},
                                                 propertyTwo={"名字": item[3]})
            if "relationship_extract_result" in i:
                extract_relationship = self.merge_list(
                    extract_relationship, i["relationship_extract_result"])
            count += 1
            print("article %d: extracted %d relationships" %
                  (count, len(extract_relationship)))
            collection.update_one({"_id": ObjectId(_id)}, {
                "$set": {
                    "relationship_extract_result": extract_relationship
                }
            })
Example #10
class MtimeSpider(Driver):
    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        self.collection = Mongodb(db='knowledge',
                                  collection='text').get_collection()

    def get_news_from_one_page(self, ele=None):
        if ele is None:
            return None
        self.fast_click_page_by_elem(ele=ele)
        # self.fast_new_page(url)
        time.sleep(1)
        if self.judge_web_element_exist_by_css_selector(
                css_selector="p.newsinnerpageall > span > a"):
            show_all_page_btn = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsinnerpageall > span > a")
            show_all_page_btn.click()
        try:
            news_title = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > div.newsheadtit").text
            news_time = re.findall(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newsheader > p.newstime").text)[0]
            news_source = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > p.newstime > span.ml15"
            ).text.split(":")[1]
            news_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsnote").get_attribute(
                    'innerHTML'
                ) + self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#newsContent").get_attribute("innerHTML")
            news_author = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsediter").text.split(":")[1]
        except Exception:
            return None
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        one_news = {}
        one_news.setdefault("标题", news_title)
        one_news.setdefault("时间", news_time)
        one_news.setdefault("来源", news_source)
        one_news.setdefault("内容", news_content)
        one_news.setdefault("作者", news_author)
        one_news.setdefault("crawl_from", self.get_current_url())
        one_news.setdefault("crwal_time", crwal_time)
        self.close_curr_page()
        return one_news

    def get_news_infos(self, spider_id, user_id, repo_id, spider_name):
        url = "http://news.mtime.com/movie/1/"
        self.fast_new_page(url=url)
        time.sleep(1)
        final_result = []
        flag = 0
        while True:
            while self.judge_web_element_exist_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore"
            ):
                more_info_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore")
                self.scroll_to_center(more_info_btn)
                more_info_btn.click()
                time.sleep(1)
            news_list = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul#newslist > li")
            for item in news_list:
                one_news = self.get_news_from_one_page(ele=item)
                if one_news is None:
                    continue
                print(one_news)
                judge_result = self.judge_data_exist_by_keys(
                    collection=self.collection,
                    keys={
                        "user_id": user_id,
                        "repo_id": repo_id,
                        "value.crawl_from": one_news["crawl_from"]
                    })
                if judge_result:
                    final_result.append(one_news)
                else:
                    flag = 1
                    break
            if flag == 1 or not self.judge_web_element_exist_by_css_selector(
                    css_selector="div#pages > a.cur + a"):
                break
            else:
                next_page_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#pages > a.cur + a")
                self.fast_click_page_by_elem(ele=next_page_btn)
                time.sleep(1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))

        for item in final_result:
            self.collection.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })
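
The news timestamp in get_news_from_one_page is pulled out of free text with a plain regular expression. The pattern is easy to sanity-check in isolation (the sample string is made up):

import re

pattern = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})"
print(re.findall(pattern, "发布时间:2020-03-01 12:30:05")[0])
# -> 2020-03-01 12:30:05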
Example #11
from model.mongodb import Mongodb
import json
from bson import json_util, objectid

a = b'{"_id":{"$oid":"5dfc557411647d60345088a3"},"datetime":"2011-01-01","movie_name":"\xe8\xae\xa9\xe5\xad\x90\xe5\xbc\xb9\xe9\xa3\x9e","release_time":"\xe4\xb8\x8a\xe6\x98\xa017\xe5\xa4\xa9","crawl_from":"\xe7\x8c\xab\xe7\x9c\xbc\xe4\xb8\x93\xe4\xb8\x9a\xe7\x89\x88","crawl_time":"2019-12-20 13:00:36","boxoffice_ratio":"47.7%","screenings_number":"11587","screenings_ratio":"37.3%","field_trips":"0","attendance_rate":"--","boxoffice_statistics":"3008.13","total_boxoffice":"5.20\xe4\xba\xbf"}\n'
a = json.loads(a, object_hook=json_util.object_hook)
print(type(a["_id"]))
print(isinstance(a["_id"], objectid.ObjectId))
Mongodb(db="test1", collection="test1").get_collection().insert_one(a)

Example #12
class DoubanSpider(Driver):
    # Set of movie-person douban URLs already crawled, used to filter out duplicates.
    member_set = set()

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False, isloadimages=True, isproxy=False,
                 proxy_ip_from="", spider_id='2', data_queue=None):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile, isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless, isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()
        # self.member_col = Mongodb(db='movies', collection='member').get_collection()
        # self.comment_col = Mongodb(db='movies', collection="comments").get_collection()

    def get_member_info(self, url=""):
        """
        Fetch one movie person's profile details.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        if "条目不存在" in self.driver.title or "页面不存在" in self.driver.title:
            self.close_curr_page()
            return None
        name = self.driver.title[:-4].strip()
        member_data = {}
        member_data.setdefault("member_name", name)
        member_data.setdefault("douban_url", url)
        member_div_infos = self.until_presence_of_all_elements_located_by_css_selector("div.info > ul > li")
        for item in member_div_infos:
            item = item.text.split(":")
            key = item[0].strip()
            if len(item) > 2:
                value = ":".join(item[1:])
            else:
                value = item[1]
            if key == "性别" or key == "星座" or key == "出生日期" or key == "出生地" or key == "官方网站":
                member_data.setdefault(key, value.strip())
            else:
                member_data.setdefault(key, [item.strip() for item in value.split("/")])
        self.close_curr_page()
        return member_data
        # self.member_col.insert_one(member_data)
        # self.info_log(data="取得个人资料数据----" + member_data["member_name"])
        # return True

    def get_member_awards(self, url=""):
        """
        Fetch every award a movie person has received.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        awards_div = self.until_presence_of_element_located_by_css_selector("div.grid-16-8.clearfix > div.article")
        result = []
        try:
            awards_info = self.until_presence_of_all_elements_located_by_css_selector(css_selector="div.awards", ele=awards_div, timeout=5)
        except Exception:
            self.close_curr_page()
            return result
        for temp in awards_info:
            awards_time = self.until_presence_of_element_located_by_css_selector(css_selector="div.hd > h2", ele=temp)
            awards = self.until_presence_of_all_elements_located_by_css_selector(css_selector="ul.award", ele=temp)
            for award in awards:
                data = {}
                award_info = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=award)
                data.setdefault("time", awards_time.text)
                data.setdefault("award_from", award_info[0].text)
                data.setdefault("award", award_info[1].text)
                data.setdefault("relevant_movie", award_info[2].text)
                result.append(data)
        self.close_curr_page()
        return result

    def get_member_movies(self, url=""):
        """
        Fetch the list of all movies a movie person has taken part in.
        :param url:
        :return:
        """
        movies = []
        self.fast_new_page(url=url)
        while True:
            movies_a = self.until_presence_of_all_elements_located_by_css_selector("div.article > div.grid_view > ul > li > dl > dd > h6 > a")
            for temp in movies_a:
                movies.append(temp.text)
            try:
                self.vertical_scroll_to()
                next_page = self.until_presence_of_element_located_by_css_selector("div.article > div.paginator > span.next > a", timeout=5)
                next_page.click()
                time.sleep(1)
            except Exception:
                self.close_curr_page()
                return movies

    def get_comments(self, url="", movie_name="", movie_id=None):
        """
        Fetch one page (20 items) of comment data.
        :param url:
        :param movie_name:
        :param movie_id:
        :return:
        """
        self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return
        comments_list = self.until_presence_of_all_elements_located_by_css_selector("div.article > div#comments.mod-bd > div.comment-item")
        if not self.judge_web_element_exist_by_css_selector(ele=comments_list[0], css_selector="div.comment"):
            self.close_curr_page()
            return
        for temp in comments_list:
            self.scroll_to_center(temp)
            data = {}
            commenter_name = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-info > a", ele=temp)
            commenter_useful = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-vote > span.votes", ele=temp)
            comment_content = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > p > span.short", ele=temp)
            comment_time = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-info > span.comment-time", ele=temp)
            data.setdefault("movie_name", movie_name)
            data.setdefault("nickname", commenter_name.text)
            data.setdefault("useful", commenter_useful.text)
            data.setdefault("time", comment_time.text)
            data.setdefault("content", comment_content.text)
            data.setdefault("comment_from", "douban.com")
            if movie_id is not None:
                data.setdefault("movie_id", movie_id)
            if self.judge_web_element_exist_by_css_selector(ele=temp, css_selector="div.comment > h3 > span.comment-info > span.rating"):
                commenter_evaluate = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.comment > h3 > span.comment-info > span.rating", ele=temp)
                data.setdefault("evaluate", commenter_evaluate.get_attribute("title"))
            else:
                data.setdefault("evaluate", "")
            # self.comment_col.insert_one(data)
        self.close_curr_page()

    def get_one_movie_info(self, ele=None):
        """
        Fetch detailed data for one movie.
        :param ele:
        :return:
        """
        self.fast_click_page_by_elem(ele=ele)
        time.sleep(1)
        # self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        try:
            actor_more = self.driver.find_element_by_css_selector("div#info > span.actor > span.attrs > a.more-actor")
            actor_more.click()
            mask = 1
        except Exception:
            mask = 0
        div_info = self.until_presence_of_element_located_by_css_selector(css_selector="div#info")
        infos = div_info.text
        info_list = infos.split("\n")
        movie_info = {}
        for info in info_list:
            info = info.split(":")
            key = info[0].strip()
            if len(info) == 1 or (len(info) == 2 and info[1] == ""):
                continue
            elif len(info) > 2:
                value = ":".join(info[1:])
            else:
                value = info[1]
            if key == "官方网站":
                movie_info.setdefault(key, value.strip())
            else:
                movie_info.setdefault(key, [item.strip() for item in value.split("/")])
        # member_link = self.until_presence_of_all_elements_located_by_css_selector(css_selector="span span.attrs a",
        #                                                                     ele=div_info)
        # if mask == 1:
        #     member_link = member_link[:-1]
        # for item in member_link:
        #     item_link = item.get_attribute("href")
        #     if item_link in self.member_set:
        #         continue
        #     self.member_set.add(item_link)
        #     actor_info = {"member_name": item.text, "douban_url": item_link}
        #     self.dataQueue.put(actor_info)
        # self.close_curr_page()
        comment1 = self.until_presence_of_element_located_by_css_selector(
            "div#comments-section > div.mod-hd > h2 > span.pl > a")
        comment2 = self.until_presence_of_element_located_by_css_selector(
            "section#reviews-wrapper > header > h2 > span.pl > a")
        comment_number = int(re.findall(r'\d+', comment1.text)[0]) + int(re.findall(r'\d+', comment2.text)[0])
        movie_info.setdefault("豆瓣评论数量", comment_number)
        self.close_curr_page()
        return movie_info

    def get_movie_infos(self, spider_id, user_id, repo_id, spider_name):
        self.fast_new_page(
            url="https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
        self.driver.refresh()
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        # category_ul = self.until_presence_of_element_located_by_css_selector("ul.category")
        # category = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=category_ul)[5:]
        # cur = 0
        # description = category[cur].text
        # category[cur].click()
        time.sleep(1)
        css_selector = "div.list-wp a.item"
        elements_list = self.until_presence_of_all_elements_located_by_css_selector(css_selector=css_selector)
        final_result = []
        for each in elements_list:
            data = {}
            self.vertical_scroll_to()
            time.sleep(1)
            self.scroll_to_center(ele=each)
            movie_link = each.get_attribute("href")
            movie_name = self.until_presence_of_element_located_by_css_selector(ele=each,
                                                                                css_selector="div.cover-wp > img")
            movie_score = self.until_presence_of_element_located_by_css_selector(ele=each,
                                                                                 css_selector="p > strong")
            data.setdefault("电影名", movie_name.get_attribute("alt"))
            data.setdefault("豆瓣评分", movie_score.text)
            crwal_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            data.setdefault("crawl_from", movie_link)
            data.setdefault("crawl_time", crwal_time)
            movie_info = self.get_one_movie_info(ele=each)
            if movie_info is None:
                continue
            movie_info.update(data)
            print(movie_info)
            final_result.append(movie_info)

        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(create_time=timezone.now(),
                                                                      data_source_name=spider_name,
                                                                      data_access="爬虫",
                                                                      repo_id=int(repo_id),
                                                                      create_id=int(user_id),
                                                                      data_path="")
        TEntityExtractionLog.objects.create(data_acquisition_id=one_data_acquisition_log.id, is_extract=0,
                                            entity_number=0, extract_time=timezone.now(), create_id=int(user_id),
                                            repo_id=int(repo_id))

        for item in final_result:
            judge_result = self.judge_data_exist_by_keys(collection=self.movie_col,
                                                         keys={"user_id": user_id, "repo_id": repo_id,
                                                               "value.电影名": item["电影名"],
                                                               "value.crawl_from": item["crawl_from"]})
            if judge_result is True:
                self.movie_col.insert_one(
                    {"file_id": one_data_acquisition_log.id, "category_id": -1, "spider_id": int(spider_id),
                     "user_id": int(user_id), "repo_id": int(repo_id), "value": item})

    # def run(self):
    #     """
    #     Entry point for one worker thread: parse each queued item's url and dispatch to the matching crawl method.
    #     :return:
    #     """
    #     self.info_log(data="线程启动", name=self.name)
    #     count = 0
    #     while not self.dataQueue.empty() and count == 0:
    #         temp = self.dataQueue.get(False)
    #         url_path = urlparse(temp["douban_url"]).path
    #         while True:
    #             try:
    #                 if "/celebrity" in url_path:
    #                     # Fetch one movie person's details
    #                     member_info = self.get_member_info(temp["douban_url"])
    #                     if member_info is None:
    #                         print("人物数据不存在")
    #                         break
    #                     member_awards = self.get_member_awards(temp["douban_url"] + "awards")
    #                     member_movies = self.get_member_movies(temp["douban_url"] + "movies")
    #                     member_info.setdefault("awards", member_awards)
    #                     member_info.setdefault("acting_movies", member_movies)
    #                     self.member_col.insert_one(member_info)
    #                     self.info_log(data="成功获取并存储一条人物数据-----" + member_info["member_name"], name=self.threadName)
    #                 elif "/subject" in url_path and "/subject_search" not in url_path and "/comments" not in url_path:
    #                     # Fetch one movie record; on success push its review-page url onto the queue
    #                     movie_info = self.get_movie_info(temp["douban_url"])
    #                     if movie_info is None:
    #                         print("电影数据不存在")
    #                         break
    #                     movie_info.update(temp)
    #                     self.movie_col.insert_one(movie_info)
    #                     self.info_log(data="成功获取并存储一条电影数据-----" + movie_info["movie_name"], name=self.threadName)
    #                     print(movie_info)
    #                     comments_url = temp["douban_url"] + "comments?start=0&limit=20&sort=new_score&status=P"
    #                     self.dataQueue.put({"movie_name": temp["movie_name"], "douban_url": comments_url, "movie_id": movie_info["_id"]})
    #                 elif "/subject" in url_path and "/comments" in url_path:
    #                     # Parse the url and crawl 200 comment entries
    #                     bits = list(urlparse(temp["douban_url"]))
    #                     qs = parse_qs(bits[4])
    #                     start = int(qs["start"][0])
    #                     while start <= 200:
    #                         qs["start"][0] = start
    #                         bits[4] = urlencode(qs, True)
    #                         temp["douban_url"] = urlunparse(bits)
    #                         self.get_comments(temp["douban_url"], temp["movie_name"], temp["movie_id"])
    #                         start += 20
    #                 count = 0
    #                 break
    #             except Exception:
    #                 # Count failures, switching proxy ip after each one; after 5 consecutive failures the thread gives up
    #                 count += 1
    #                 if count > 5:
    #                     self.dataQueue.put(temp)
    #                     break
    #                 self.change_ip(self.get_ip(self.proxy_ip_from))

    @staticmethod
    def get_data_source():
        """
        Return the set of movie-person douban URLs already collected.
        :return:
        """
        member_col = Mongodb(db='movies', collection='member').get_collection()
        url_set = set()
        for item in member_col.find():
            url_set.add(item["douban_url"])
        return url_set
Example #13
class MaoyanSpider(Driver):
    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        self.boxoffice_col = Mongodb(db='knowledge',
                                     collection='text').get_collection()
        self.news_col = Mongodb(db='movies1',
                                collection='news').get_collection()

    @staticmethod
    def find_key_from_value(mapping, value):
        key_list = mapping.keys()
        for key in key_list:
            if value == mapping[key]:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self,
                                          url="",
                                          datetime="",
                                          user_id=-1,
                                          repo_id=-1):
        """
        Fetch the Maoyan box-office snapshot for the given date.
        :param repo_id:
        :param user_id:
        :param datetime:
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        time.sleep(1)
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.dashboard-content"):
            self.close_curr_page()
            return [], True
        theads = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.dashboard-list > table.dashboard-table.table-header > thead > tr > th"
        )[1:]
        theads = [item.text for item in theads]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector=
                "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        ):
            self.close_curr_page()
            return [], False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        )
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for item in boxoffice_infos:
            one_boxoffice_data = {}
            boxoffice_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=item)
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name",
                ele=boxoffice_info[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "div > div.moviename-desc > p.moviename-info > span",
                ele=boxoffice_info[0])
            one_boxoffice_data.setdefault("日期", datetime)
            one_boxoffice_data.setdefault("电影名", movie_name.text)
            one_boxoffice_data.setdefault("上映时间", movie_info[0].text)
            one_boxoffice_data.setdefault("总票房", movie_info[1].text)
            boxoffice_info = boxoffice_info[1:]
            for i in range(len(boxoffice_info)):
                one_boxoffice_data.setdefault(theads[i],
                                              boxoffice_info[i].text)
            one_boxoffice_data.setdefault("crawl_time", crwal_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            # self.piaofang_col.insert_one(one_piaofang_data)
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.日期": one_boxoffice_data["日期"],
                    "value.电影名": one_boxoffice_data["电影名"],
                    "value.crawl_from": one_boxoffice_data["crawl_from"]
                })
            if judge_result is True:
                boxoffice_data_from_the_page.append(one_boxoffice_data)
            else:
                self.close_curr_page()
                return boxoffice_data_from_the_page, False

        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        # date = datetime.datetime.now()
        final_result = []
        while True:
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url="http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                datetime=str(date)[:10],
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            date = date + datetime.timedelta(days=-1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))

        for item in final_result:
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        lastest_info = self.boxoffice_col.find().sort("datetime", -1).limit(1)
        date = datetime.datetime.strptime(lastest_info[0]["datetime"],
                                          '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            self.get_boxoffice_infos_from_one_page(
                "http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                str(date)[:10])
            date = date + datetime.timedelta(days=1)
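
Both crawl entry points walk the dashboard date by date: one steps backwards from a fixed day until already-stored data is hit, the other forwards from the last stored day to now. Stripped of the spider plumbing, the backfill pattern reduces to a generator like this (a sketch, not part of the original module):

import datetime

def backfill_dates(start, should_stop):
    # Yield 'YYYY-MM-DD' strings from `start` backwards until should_stop(date) is true.
    date = start
    while not should_stop(date):
        yield str(date)[:10]
        date -= datetime.timedelta(days=1)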
Example #14
    def update_t_mapping_rule(self, repo_id, create_id):
        # The rule set is recomputed for every category.
        return_category = TCategory.objects.filter(repo_id=repo_id,
                                                   create_id=create_id)

        category_name_list = []
        category_id_list = []
        for val in return_category:
            val_dict = model_to_dict(val)
            category_name_list.append(val_dict['category_name'])
            category_id_list.append(val_dict['id'])

        list_len = len(category_name_list)
        #print(list_len)
        #print(category_name_list)
        #print(category_id_list)
        for i in range(0, list_len):
            tmp_id = category_id_list[i]
            attribute_name_alias_map = {}
            return_attribute = TAttribute.objects.filter(category_id=tmp_id)

            # This lookup is factored into a helper, otherwise the function gets too long.
            # Input: the queried attributes; returns a map.
            # Covers not only the category's own attributes but also the parent's.
            # attribute_name_alias_map must also hold the parent categories' entries.
            attribute_name_alias_map = self.return_attribute_name_map(
                return_attribute, attribute_name_alias_map)
            #print(attribute_name_alias_map)

            ret_cate = TCategory.objects.get(id=tmp_id)
            ret_cate_dict = model_to_dict(ret_cate)
            father_category_id = ret_cate_dict['father_category_id']
            #print(father_category_id,type(father_category_id))
            if (str(-1) != father_category_id):
                return_attribute_father = TAttribute.objects.filter(
                    category_id=father_category_id)
                attribute_name_alias_map = self.return_attribute_name_map(
                    return_attribute_father, attribute_name_alias_map)
                ret_cate_father = TCategory.objects.get(id=father_category_id)
                ret_cate_dict_father = model_to_dict(ret_cate_father)
                father_father_category_id = ret_cate_dict_father[
                    'father_category_id']
                #print(father_father_category_id)
                if (str(-1) != father_father_category_id):
                    return_attribute_father_father = TAttribute.objects.filter(
                        category_id=father_father_category_id)
                    attribute_name_alias_map = self.return_attribute_name_map(
                        return_attribute_father_father,
                        attribute_name_alias_map)

            #print(attribute_name_alias_map)

            news_col = Mongodb(db='knowledge',
                               collection='text').get_collection()
            #print(list_len)

            # All the names are now in attribute_name_alias_map.
            _insert_mapping_rule_attribute_name_list = []
            _insert_mapping_rule_attribute_coverage_rate_list = []
            category_id = category_id_list[i]
            # Look the documents up in MongoDB.
            attribute_name_map = {}
            tmp_list = news_col.find({'category_id': category_id})
            num = 0
            for val in tmp_list:
                #print(val)
                num += 1
                if val is not None:
                    for key in val.keys():
                        #print(key)
                        #print(attribute_name_map)
                        #print(key in attribute_name_map )
                        if (key == '_id' or key == 'file_id'
                                or key == 'category_id'):
                            continue
                        elif (key in attribute_name_map):

                            attribute_name_map[key] += 1
                        else:
                            attribute_name_map[key] = 1
            # When entities disappear, stale rows left in t_mapping_rule still have to be cleaned up.
            # As long as at least one entity keeps an attribute, its rule stays; otherwise it is dropped below.
            # A more careful version would verify each attribute still exists and delete the rule if it does not.
            #
            delete_id_list = []
            #print(111)

            return_mapping_rule = TMappingRule.objects.filter(
                category_id=category_id, create_id=create_id)
            #print(111)
            for rule in return_mapping_rule:

                rule_dict = model_to_dict(rule)
                print(rule_dict)
                rule_dict_id = rule_dict['id']
                rule_dict_attribute_name = rule_dict['attribute_name']
                if rule_dict_attribute_name not in attribute_name_map.keys():
                    delete_id_list.append(rule_dict_id)
            for mapping_rule_id in delete_id_list:
                # Deletions would ideally be logged first, otherwise problems are hard to trace later.
                rule_mapping = TMappingRule.objects.get(id=mapping_rule_id)
                rule_mapping.delete()
            #
            #print(attribute_name_map)
            for key in attribute_name_map.keys():
                if key in attribute_name_alias_map:
                    continue
                _insert_mapping_rule_attribute_name_list.append(key)
                coverage_rate = 1.0 * attribute_name_map[key] / num
                _insert_mapping_rule_attribute_coverage_rate_list.append(
                    coverage_rate)
            attribute_name_list_len = len(
                _insert_mapping_rule_attribute_name_list)
            dt = datetime.now()
            #print(attribute_name_list_len)
            #print(_insert_mapping_rule_attribute_name_list)
            for k in range(0, attribute_name_list_len):
                attribute_name_val = _insert_mapping_rule_attribute_name_list[
                    k]
                attribute_coverage_val = _insert_mapping_rule_attribute_coverage_rate_list[
                    k]
                obj = TMappingRule.objects.filter(
                    attribute_name=attribute_name_val,
                    create_id=create_id).first()
                #print(attribute_name_val,attribute_coverage_val)
                if (obj is None):
                    # create
                    TMappingRule.objects.create(
                        attribute_name=attribute_name_val,
                        coverage_rate=attribute_coverage_val,
                        create_time=str(dt)[:19],
                        category_id=category_id,
                        create_id=create_id)
                else:
                    # update
                    obj.coverage_rate = attribute_coverage_val
                    obj.create_time = str(dt)[:19]
                    obj.save()
        return 1
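
The counting loop above boils down to: for each attribute key seen in a category's documents, store the fraction of documents that contain it. A compact equivalent of just that computation (a sketch; SKIP_KEYS mirrors the keys the loop ignores):

from collections import Counter

SKIP_KEYS = {'_id', 'file_id', 'category_id'}

def coverage_rates(docs):
    # Fraction of documents that contain each attribute key.
    docs = list(docs)
    counts = Counter(k for doc in docs for k in doc if k not in SKIP_KEYS)
    return {k: n / len(docs) for k, n in counts.items()} if docs else {}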
Example #15
class BaikeSpider(Driver):
    urls = []
    # tags = ["电影", "演员", "导演", "编剧", "制片人"]
    count = 0

    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 proxy_ip_from="",
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
        self.baike_col = Mongodb(db='baike',
                                 collection="test1").get_collection()

    def get_infos(self, url="", extensive_properties=None):
        if extensive_properties is None:
            extensive_properties = {}
        self.fast_new_page(url=url)
        relationship_urls = []
        relationship_tags = []
        if self.judge_web_element_exist_by_css_selector(
                css_selector=
                "div.polysemantList-header-title > div.toggle.expand"):
            synonym = self.until_presence_of_element_located_by_css_selector(
                css_selector=
                "div.polysemantList-header-title > div.toggle.expand > a")
            self.scroll_to_center(synonym)
            synonym.click()
            member_urls = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "ul.polysemantList-wrapper.cmn-clearfix > li.item > a")
            for item in member_urls:
                # for tag in self.tags:
                #     if tag in item.text:
                relationship_urls.append(item.get_attribute("href"))
                relationship_tags.append(item.text)
                # break
        if self.driver.current_url not in self.urls:
            data = self.get_base_info_from_baike()
            if data is not None:
                current_tag = self.until_presence_of_element_located_by_css_selector(
                    css_selector=
                    "ul.polysemantList-wrapper.cmn-clearfix > li.item > span.selected"
                )
                data.setdefault("tag", current_tag.text)
                data.update(extensive_properties)
                print(data)
                self.baike_col.insert_one(data)
                self.urls.append(self.driver.current_url)
            self.close_curr_page()

        for item in relationship_urls:
            if item not in self.urls:
                self.fast_new_page(url=item)
                data = self.get_base_info_from_baike()
                if data is not None:
                    data.setdefault(
                        "tag",
                        relationship_tags[relationship_urls.index(item)])
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(item)
                self.close_curr_page()
        if self.count == 10:
            return False
        return True

    def get_base_info_from_baike(self):
        try:
            # Pages without a basic-info table yield nothing
            if not self.judge_web_element_exist_by_css_selector(
                    css_selector=
                    "div.content > div.main-content div.basic-info.cmn-clearfix"
            ):
                return
            basic_info_div = self.until_presence_of_element_located_by_css_selector(
                css_selector=
                "div.content > div.main-content div.basic-info.cmn-clearfix")

            # Expand the collapsed half of the info table when present
            if self.judge_web_element_exist_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand"):
                btn = self.until_presence_of_element_located_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand")
                self.scroll_to_center(btn)
                btn.click()

            basic_info_name = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dt.basicInfo-item.name", ele=basic_info_div)
            basic_info_value = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dd.basicInfo-item.value",
                ele=basic_info_div)
            data = {}
            # Pair each <dt> field name with its <dd> value
            for name_el, value_el in zip(basic_info_name, basic_info_value):
                name = name_el.text.replace(" ", "")
                value = value_el.text
                if name == "" or value.replace(" ", "") == "":
                    continue
                data.setdefault(name, value)
            data.setdefault("url", self.driver.current_url)
            if self.judge_web_element_exist_by_css_selector(
                    css_selector="div.lemma-summary"):
                base_infos = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.lemma-summary").text
                data.setdefault("基础信息", base_infos)  # "基础信息" = basic summary
            self.count = 0
            return data
        except Exception:
            # count consecutive failures; get_infos aborts after ten
            self.count += 1
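
A minimal launch sketch for the spider above, assuming the Driver base class accepts the keyword arguments shown in these examples, a MongoDB instance is reachable for the baike/test1 collection, and the seed URL is a placeholder:

if __name__ == "__main__":
    # Hypothetical entry point; BaikeSpider, Driver and Mongodb come from
    # this project, not from a standard library.
    spider = BaikeSpider(isheadless=True, isloadimages=False)
    seed_url = "https://baike.baidu.com/item/..."  # placeholder entry URL
    # extensive_properties is merged into every record stored in MongoDB
    keep_going = spider.get_infos(url=seed_url,
                                  extensive_properties={"source": "baike"})
    if not keep_going:
        print("stopped: ten consecutive pages without a basic-info table")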
Example #16
    def eventExtraction(self, request, file_id, lEventCategoryId):
        """
        功能 进行模板匹配的事件抽取
        :param request:                          request参数
        :param file_id:             数据类型str  文件id
        :param lEventCategoryId:    数据类型list 事件类目id
        :return: True
        """
        #加入ruleId 1或者2
        #1的事件是三元组主谓宾 2的话变事件是主谓
        #only for debug
        #request.session['user_id'] = 1
        #request.session['repo_id'] = 1
        #fileId = 13
        #only for debug

        #fileId = request.POST['fileId']
        #request.session['repo_id']=1
        #request.session['user_id']=1
        repoId = request.session['repo_id']
        createId = request.session['user_id']

        #存到这个file_id 里面
        tmp_info = {'file_id': file_id, 'user_id': createId, 'repo_id': repoId}
        news_col = Mongodb(db='knowledge', collection='text').get_collection()
        cnt = 1
        ret_entity = news_col.find(tmp_info)
        ret_entity_map = list()
        for item in ret_entity:
            if "内容" in item["value"]:
                ret_entity_map.append(item)

        if len(ret_entity_map) == 0:
            return  # nothing to extract for this file

        print("--------------------事件抽取")
        #在这个之前把所有的词语都加进去
        #整个循环都是为了把这个repoId的所有的触发词以及他们的事件主题客体都加入进去
        retTriggerWordList = TTriggerWord.objects.filter(repo_id=repoId)
        eventLabelList = []
        # hanlpUnit=HanlpUnit()
        #这边要修  我们要从事类目开始查询
        for i in retTriggerWordList:
            tmpLableList = []
            ruleId = 1
            retTriggerWordDict = model_to_dict(i)
            triggerId = retTriggerWordDict['id']
            eventId = retTriggerWordDict['event_rule_id']
            #print(111, eventId)
            # trigger word text and the trigger word's label
            retEventRule = TEventRule.objects.get(id=eventId)
            #print(333, retEventRule.category_id)
            retCategoryName = TCategory.objects.get(
                id=retEventRule.category_id).category_name
            #print(444, retCategoryName)
            # the trigger word's label must become the event's label here;
            # to be revised later
            triggerWord = retTriggerWordDict['trigger_word']
            triggerWordId = BaseController.get_category_name(
                request, retCategoryName)
            #print(222, eventId)

            eventRule = TEventRule.objects.get(id=eventId, repo_id=repoId)
            eventRuleDict = model_to_dict(eventRule)
            eventCategoryId = eventRuleDict['category_id']
            if eventCategoryId not in lEventCategoryId:
                continue
            eventCategory = TCategory.objects.get(id=eventCategoryId,
                                                  repo_id=repoId,
                                                  create_id=createId)
            eventCategoryDict = model_to_dict(eventCategory)
            eventCategoryName = eventCategoryDict['category_name']
            tmpLableList.append(eventCategoryName)
            # event category

            subjectCategoryId = eventRuleDict['event_subject_id']
            subjectCategory = TCategory.objects.get(id=subjectCategoryId,
                                                    repo_id=repoId,
                                                    create_id=createId)
            subjectCategoryDict = model_to_dict(subjectCategory)
            subjectCategoryName = subjectCategoryDict['category_name']
            subjectId = BaseController.get_category_name(
                request, subjectCategoryName)
            tmpLableList.append(subjectId)
            retListId, retListVal = some_data_deal_func(
            ).inputCategoryIdReturnName(subjectCategoryId, repoId, createId)
            # Register every value in retListVal with the segmenter so the
            # text is split on these words: build constructWordList, pairing
            # each word with its mask (the subject category id).
            constructWordList = []
            #print(len(retListVal))
            for word in retListVal:
                if word is None:
                    continue
                tmpDict = {}
                tmpDict['word'] = word
                #print(word)
                # item["word"], item["mask"]
                tmpDict['mask'] = subjectId
                constructWordList.append(tmpDict)

            # add_word_list expects a list like [{'word': ..., 'mask': ...}]
            self.hanlp_tool.add_word_list(constructWordList)
            #print(constructWordList)
            objectCategoryId = eventRuleDict['event_object_id']
            if objectCategoryId == -1:
                # no object category: fall back to the subject-verb rule
                ruleId = 2

            constructWordList = []
            tmpDict = {}
            tmpDict['word'] = triggerWord
            tmpDict['mask'] = str(triggerWordId)
            constructWordList.append(tmpDict)
            self.hanlp_tool.add_word_list(constructWordList)
            tmpLableList.append(str(triggerWordId))
            print(ruleId)
            if ruleId == 1:
                objectCategoryId = eventRuleDict['event_object_id']
                objectCategory = TCategory.objects.get(id=objectCategoryId,
                                                       repo_id=repoId,
                                                       create_id=createId)
                objectCategoryDict = model_to_dict(objectCategory)
                objectCategoryName = objectCategoryDict['category_name']
                objectId = BaseController.get_category_name(
                    request, objectCategoryName)
                retListId, retListVal = some_data_deal_func(
                ).inputCategoryIdReturnName(objectCategoryId, repoId, createId)
                tmpLableList.append(objectId)
                constructWordList = []
                # this code has changed elsewhere and needs updating
                for word in retListVal:
                    if word is None:
                        continue
                    tmpDict = {}
                    tmpDict['word'] = word
                    # item["word"], item["mask"]
                    tmpDict['mask'] = str(objectId)
                    constructWordList.append(tmpDict)
                # add_word_list expects a list like [{'word': ..., 'mask': ...}]
                #print(constructWordList)
                self.hanlp_tool.add_word_list(constructWordList)

            eventLabelList.append(tmpLableList)

        # eventLabelList rows hold: event category, event subject label,
        # trigger-word label, event object label
        #print(eventLabelList)
        # print("word-list contents")
        # tmpS = self.hanlp_tool.added_word_list
        # for name in tmpS:
        #     print(name)
        # print("end of word-list contents")
        #return True
        # NOTE: category_id is hardcoded to 1 here (the "name" attribute)
        attribute = TAttribute.objects.get(category_id=1)
        attributeDict = model_to_dict(attribute)
        attributeName = attributeDict['attribute_name']
        #print(self.hanlp_tool.added_word_list)
        cnt = 1
        for i in ret_entity_map:
            _id = i['_id']
            # the extraction results are written back under this _id
            value = i['value']
            basetime = str(value['时间'])  # '时间' = time field
            content = value['内容']  # '内容' = body text
            text = HanlpUnit().get_text_from_html(content)
            sentenceList = self.hanlp_tool.split_paragraph(text)
            #print(sentenceList)
            # for each sentence, pull out time, place, event subject, event
            # object, and the subject/object categories
            event_extract_result = []
            count = 0
            countIndex = 0
            tmpEventSet = set()
            for sent in sentenceList:
                sent = sent.strip()
                #print(sent)
                # tokenize each sentence and match its events
                #sent = "浙江杭州明天林更新出演动作喜剧《快手枪手快枪手》"
                sentenceDealResult = self.hanlp_tool.cut(sent)
                event = self.eventExtractionByTemplateMatching(
                    sent, eventLabelList)
                # event extraction done
                # dateTime still needs adjusting; basetime can cause problems
                #print(basetime)

                dateTime = basetime
                timeIndex = -1
                #print(123, timeIndex)
                timeIndex, timeWord, dateTime = Time_deal().dealTime(
                    sent, basetime)
                if timeIndex != -1:
                    timeIndex = timeIndex + countIndex
                #print(46, timeIndex)
                #print(11111111, dateTime)

                locationList = Time_deal().dealArea(sent)
                location = ''
                locationindex = -1
                # keep the longest place name found in the sentence
                for val in locationList:
                    if len(val['place']) > len(location):
                        location = val['place']
                        locationindex = val['index'] + countIndex
                #print(location, locationindex)
                countIndex += len(sentenceDealResult)

                # time, place and their indices are returned with each event
                #print(event)
                for eve in event:
                    ruleId = 1
                    if len(eve) == 3:
                        # eve is (eveId, subject, trigger): subject-verb rule
                        ruleId = 2
                    eveId = eve[0]
                    subjectLabel = eventLabelList[eveId][1]
                    #triggerLabel = BaseController.get_category_name()eventLabelList[eveId][0]

                    attribute = {}
                    attribute['发生时间'] = dateTime  # occurrence time
                    attribute['地点'] = location  # place

                    # the event name is subject + trigger (+ object) joined
                    eveString = ''.join(str(eve[j]) for j in range(1, len(eve)))
                    attribute['名字'] = eveString  # name
                    # eventLabel is looked up from the category table
                    eventLabel = BaseController.get_category_name(
                        request, eventLabelList[eveId][0])
                    #print(eventLabel)
                    #print(eventLabelList[eveId])
                    #print(event)

                    Neo4j().create_node_mjy_edition(eventLabel, attribute)
                    subjectNameVal = eve[1]
                    # print(subjectCategoryName, attributeName, subjectNameVal)
                    neo4jSubjectId = Neo4j().quesIdByLabelAttribute(
                        subjectLabel, attributeName,
                        '\'' + subjectNameVal + '\'')
                    neo4jEventId = Neo4j().quesIdByLabelAttribute(
                        eventLabel, '名字', '\'' + eveString + '\'')
                    # "主谓关系" = subject-predicate relation
                    Neo4j().createRelationship(subjectLabel, eventLabel,
                                               "主谓关系", {'id': neo4jSubjectId},
                                               {'id': neo4jEventId})
                    if ruleId == 1:
                        objectNameVal = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        neo4jObjectId = Neo4j().quesIdByLabelAttribute(
                            objectLabel, attributeName,
                            '\'' + objectNameVal + '\'')
                        # "动宾关系" = verb-object relation
                        Neo4j().createRelationship(eventLabel, objectLabel,
                                                   "动宾关系",
                                                   {'id': neo4jEventId},
                                                   {'id': neo4jObjectId})
                        #print(neo4jSubjectId, neo4jEventId, neo4jObjectId)
                    tmpEventDict = {}
                    tmpEventDict['actual_event_time'] = dateTime
                    # collect the extracted event fields
                    tmpEventDict['time'] = timeWord
                    tmpEventDict['timeIndex'] = timeIndex
                    tmpEventDict['location'] = location
                    tmpEventDict['locationIndex'] = locationindex
                    #print(111,dateTime,location)
                    tmpEventDict['eventSubject'] = eve[1]
                    tmpEventDict['eventSubjectLabel'] = subjectLabel
                    tmpEventDict['triggerLabel'] = eventLabel
                    tmpEventDict['triggerWord'] = eve[2]
                    tmpEventDict['eventName'] = eveString
                    if ruleId == 1:
                        tmpEventDict['eventObject'] = eve[3]
                        objectLabel = eventLabelList[eveId][3]
                        tmpEventDict['eventObjectLabel'] = objectLabel
                    # deduplicate events by their concatenated name
                    if eveString not in tmpEventSet:
                        tmpEventSet.add(eveString)
                        event_extract_result.append(tmpEventDict)
                    print(tmpEventDict)
                    count += 1
            # write the extraction results back to mongodb
            #print(count, event_extract_result)
            news_col.update_one(
                {'_id': _id},
                {"$set": {
                    'event_extract_result': event_extract_result
                }})
            #news_col.insert_one()
            cnt += 1
            #if (cnt >= 2):
            #    break
        return True
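
The update_one/$set call above stores each document's events under event_extract_result. A minimal read-back sketch using pymongo directly, assuming a local MongoDB and the same knowledge/text collection (the file_id value is a placeholder):

from pymongo import MongoClient

col = MongoClient("mongodb://localhost:27017")["knowledge"]["text"]
doc = col.find_one({"file_id": "1",
                    "event_extract_result": {"$exists": True}})
if doc is not None:
    for event in doc["event_extract_result"]:
        # each entry has the shape of tmpEventDict built above
        print(event.get("eventName"), event.get("actual_event_time"))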
Example #17
 def __init__(self):
     self.__context = {}
     self.knowledge_col = Mongodb(db='knowledge',
                                  collection='text').get_collection()
Example #18
 def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False, isloadimages=True, isproxy=False,
              proxy_ip_from="", spider_id='2', data_queue=None):
     Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile, isvirtualdisplay=isvirtualdisplay,
                     isheadless=isheadless, isloadimages=isloadimages, isproxy=isproxy,
                     proxy_ip_from=proxy_ip_from)
     self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()
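
Every example above depends on the project's Mongodb helper, whose source is not shown here. A minimal compatible sketch, assuming it wraps pymongo and connects to a local server; the real class may handle configuration, authentication and connection pooling differently:

from pymongo import MongoClient

class Mongodb:
    # Hypothetical reimplementation for illustration only.
    def __init__(self, db, collection, uri="mongodb://localhost:27017"):
        self.__client = MongoClient(uri)
        self.__collection = self.__client[db][collection]

    def get_collection(self):
        # Returns a pymongo Collection supporting find/insert_one/update_one.
        return self.__collection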