示例#1
0
    def __init__(self):
        """Restore the pending-extraction queue and build the InfoExtractor.

        ``self.__ext_queue`` maps item id -> {title, url, filename, decision};
        it is unpickled from ``config.path_extract_list`` when that file
        exists, otherwise it starts empty.
        """
        self.__ext_queue = {}
        if os.path.isfile(config.path_extract_list):
            # NOTE(review): pickle.loads on a file read from disk is unsafe
            # if the file can be written by an untrusted party.
            # Context manager closes the handle; the old open(...).read()
            # leaked it until garbage collection.
            with open(config.path_extract_list) as queue_file:
                self.__ext_queue = pickle.loads(queue_file.read())

        self.__ie = InfoExtractor(config.path_extract_onto + "/seminar.xml",
                                  config.path_extract_onto)
示例#2
0
	def getResults(self):
		"""Run one search round: aggregate raw results for ``self.input``,
		extract sentences from them, and persist each sentence into the
		knowledge graph under this query's input."""
		aggregator = InfoAggregator(self.input)
		extractor = InfoExtractor(aggregator.getRawResults())
		# Load the persisted knowledge graph, append every extracted
		# sentence, then write it back out.
		graph = K_Graph().load()
		for sentence in extractor.getExtractedResults():
			graph.add_point(self.input, sentence)
		graph.save()
示例#3
0
    def __init__(self):
        """Restore the pending-extraction queue and set up the InfoExtractor.

        The queue is unpickled from ``config.path_extract_list`` when that
        file exists; otherwise it starts empty.
        """
        self.__ext_queue = {}
        if os.path.isfile(config.path_extract_list):
            # Context manager closes the handle; the old open(...).read()
            # leaked it until garbage collection.
            with open(config.path_extract_list) as queue_file:
                self.__ext_queue = pickle.loads(queue_file.read())

        self.__ie = InfoExtractor(
            config.path_extract_onto + "/seminar.xml", config.path_extract_onto)
示例#4
0
def get_all_subject(insertDB):
    """Fetch every subject, course, and course-detail page for 2018 spring
    and dump the result to ``allCourses.json``.

    :param insertDB: if True, then all of the data will be inserted into mongoDB
    """
    def fetch_all(urls):
        # Queue one download coroutine per url, then run them concurrently
        # and return the raw responses.  Factors out the getter boilerplate
        # that was previously copy-pasted three times.
        getter = XmlGetter()
        for url in urls:
            getter.add_get_content_coroutine(url)
        return getter.run_getter()

    start = time.time()

    print('Please wait, getting all subjects...')
    subjects_raw = fetch_all([addrBuilder('2018', 'spring').build()])
    addr_list = InfoExtractor(subjects_raw).find_all_sub()

    print('Please wait, getting all courses...')
    courses_raw = fetch_all(addr_list)
    courses_list = InfoExtractor(courses_raw).find_all_courses()

    # '?mode=detail' expands each course page to include its sections.
    detail_courses = [url + '?mode=detail' for url in courses_list]
    print('Please wait, getting all sections...')
    detail_raw = fetch_all(detail_courses)

    JsonDumper(detail_raw).dump('allCourses.json', insertDB)

    print('Time used: ', time.time() - start)
示例#5
0
class SAEExtractor:
    """Socket-driven service that extracts seminar attributes from crawled
    pages and manages a queue of items awaiting a manual extractor choice.

    ``__ext_queue`` maps item id -> {title, url, filename, decision,
    extractor} and is persisted with pickle between runs.  ``process`` is the
    socket entry point: it unpickles a command dict and dispatches on its
    'operation' field via ``__operations``.
    """

    def __init__(self):
        """Restore the pending queue from disk and build the InfoExtractor."""
        self.__ext_queue = {}
        if os.path.isfile(config.path_extract_list):
            # The queue is saved as a binary pickle (protocol -1), so it must
            # be read in "rb"; text mode would mangle it on Windows.  The
            # context manager also closes the previously-leaked handle.
            with open(config.path_extract_list, "rb") as queue_file:
                self.__ext_queue = pickle.loads(queue_file.read())

        self.__ie = InfoExtractor(
            config.path_extract_onto + "/seminar.xml", config.path_extract_onto)

    def save(self):
        """Persist the pending queue so it survives a restart."""
        # Protocol -1 is a binary pickle: open in "wb", not "w", or the dump
        # is corrupted on Windows.  "with" guarantees the handle is closed
        # even if the write raises.
        with open(config.path_extract_list, "wb") as queue_file:
            queue_file.write(pickle.dumps(self.__ext_queue, -1))

    def __auto_extract(self):
        # Placeholder: automatic extraction is not implemented yet.
        pass

    def __extract(self, item, extractor):
        """Run *extractor* on *item*, store the result in the DB, mark the
        item as extracted, and delete its page from the inbox."""
        result = self.__ie.extract(item, extractor)
        log.info(extractor)
        log.info(result)
        info = dict()
        # Truncate each attribute to the column width the ontology allows.
        # (Renamed the loop variable: the original shadowed builtin `str`.)
        for att, value in result.iteritems():
            info[self.__ie.db_col(att)] = value[0:self.__ie.max_len(att)]
        log.info(info)
        db.new_sem_with_map(item['id'], info)
        item['extractor'] = extractor
        item.save()

        os.remove(config.path_extractor_inbox + "/%s" % item.filename())

    def __op_new(self, data_loaded, connection):
        """Handle a newly judged item.

        If enough already-extracted pages share this item's layout hash,
        reuse their majority extractor immediately; otherwise queue the item
        for a manual extractor decision.
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)

        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator

            # The extractor used most often for this layout wins.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                return

        extractor = config.const_RULE_UNKNOW

        self.__ext_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor
        }

        log.info("[%s]: # %s " % (item_id, extractor))

    def __op_refresh(self, data_loaded, connection):
        """Re-scan the queue: drop entries whose page file disappeared, and
        extract entries whose layout now has a confident majority extractor.
        Deletions are collected first so the dict is not mutated while it is
        being iterated."""
        delete_ids = []
        for key, ent in self.__ext_queue.iteritems():
            item_id = int(key)
            if not os.path.isfile(
                    config.path_extractor_inbox + "/" + str(item_id) + ".html"):
                delete_ids.append(item_id)
                continue
            item = UrlItem.load_with_content(
                id=item_id, file_path=config.path_extractor_inbox)
            count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
            log.info(str(maps))
            log.info(count)
            if len(maps) > 0:
                import operator
                tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
                log.info(float(tar_ext[1]) / len(maps))
                if tar_ext[1] > config.extractor_same_layout_number:
                    extractor = tool.str2extractor(tar_ext[0])
                    self.__extract(item, extractor)
                    delete_ids.append(item_id)
        for ent_id in delete_ids:
            del self.__ext_queue[ent_id]

    def __op_list(self, data_loaded, connection):
        """Send the whole pending queue back to the client, pickled."""
        tool.send_msg(connection, pickle.dumps(self.__ext_queue, -1))

    def __op_maps(self, data_loaded, connection):
        """Send the rule map and action map of the InfoExtractor, pickled."""
        tool.send_msg(connection,
            pickle.dumps({
                'rule': self.__ie.map(),
                'action': self.__ie.action_map()
            }, -1))

    def __op_preview(self, data_loaded, connection):
        """Send a preview of what *extractor* would extract from the item;
        for the UNKNOW rule, send one empty value per attribute."""
        log.info(data_loaded['extractor'])
        if data_loaded['extractor'] == config.const_RULE_UNKNOW:
            result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
        else:
            item_id = int(data_loaded['id'])
            item = UrlItem.load_with_content(
                item_id, file_path=config.path_extractor_inbox)
            extractor = data_loaded['extractor']
            result = self.__ie.extract(item, extractor)
        preview = list()
        # (Renamed the loop variable: the original shadowed builtin `str`.)
        for att, value in result.iteritems():
            preview.insert(att, dict(name=self.__ie.name(att), value=value))
        log.info(preview)
        tool.send_msg(connection, pickle.dumps(preview, -1))

    def __op_rejudge_done(self, data_loaded, connection):
        """Remove the item from the queue and hand it back to the judge
        service with the given decision."""
        item_id = int(data_loaded['id'])
        decision = int(data_loaded['decision'])
        item = UrlItem.load(id=item_id)
        del self.__ext_queue[item_id]
        self.__send_back_to_judge(item, decision)
        tool.send_msg(connection, "0")

    def __op_test_rule(self, data_loaded, connection):
        """Apply a single candidate rule to one attribute of the item and
        send the raw extraction result back."""
        item_id = int(data_loaded['id'])
        rule = data_loaded['rule']
        attrid = int(data_loaded['attrid'])
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)
        tool.send_msg(
            connection,
            self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid)
        )

    def __op_add_rule(self, data_loaded, connection):
        """Register a new rule for an attribute, then send the refreshed
        rule/action maps back."""
        rule = data_loaded['rule']
        attrid = int(data_loaded['attrid'])
        # Return value (the new rule id) is not needed here.
        self.__ie.add_rule(attrid, rule)
        tool.send_msg(
            connection,
            pickle.dumps({
                'rule': self.__ie.map(),
                'action': self.__ie.action_map()
                }, -1)
        )

    def __op_add_extractor(self, data_loaded, connection):
        """Extract the item with the manually chosen extractor and remove it
        from the pending queue."""
        item_id = int(data_loaded['id'])
        extractor = data_loaded['extractor']

        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        self.__extract(item, extractor)
        del self.__ext_queue[item['id']]
        tool.send_msg(
            connection,
            "0"
        )

    @staticmethod
    def __operations(cmd):
        """Return the handler method for socket command *cmd*.

        Raises KeyError for unknown commands.
        """
        maps = {
            config.socket_CMD_extractor_new: SAEExtractor.__op_new,
            config.socket_CMD_extractor_list: SAEExtractor.__op_list,
            config.socket_CMD_extractor_maps: SAEExtractor.__op_maps,
            config.socket_CMD_extractor_preview: SAEExtractor.__op_preview,
            config.socket_CMD_extractor_rejudge_done: SAEExtractor.__op_rejudge_done,
            config.socket_CMD_extractor_test_rule: SAEExtractor.__op_test_rule,
            config.socket_CMD_extractor_add_rule: SAEExtractor.__op_add_rule,
            config.socket_CMD_extractor_add_extract: SAEExtractor.__op_add_extractor,
            config.socket_CMD_extractor_refresh: SAEExtractor.__op_refresh,
        }
        return maps[cmd]

    @staticmethod
    def __send_back_to_judge(item, decision):
        """Move the item's page back into the judge inbox and notify the
        judge service over its socket."""
        shutil.move(config.path_extractor_inbox + "/%s" % item.filename(),
                    config.path_judge_inbox + "/%s" % item.filename())

        data = {"operation": config.socket_CMD_judge_new, "id": item['id'], "decision": decision}
        data_string = pickle.dumps(data, -1)
        tool.send_message(data_string, config.socket_addr_judge)

    def process(self, connection, client_address):
        """Socket entry point: read one pickled command dict from
        *connection*, dispatch it, and always close the connection."""
        try:
            data = tool.recv_msg(connection)
            # NOTE(review): pickle.loads on socket data trusts the peer;
            # a hostile client could execute arbitrary code.
            data_loaded = pickle.loads(data)
            log.debug('new connection from %s', client_address)
            log.debug("data received: %s", data_loaded)
            self.__operations(data_loaded['operation'])(self, data_loaded, connection)
        finally:
            log.debug('connection closed for %s', client_address)
            connection.close()
示例#6
0
class SAEExtractor:
    """Socket-driven service that extracts seminar attributes from crawled
    pages and keeps a queue of items awaiting a manual extractor choice.

    ``__ext_queue`` maps item id -> {title, url, filename, decision,
    extractor}; it is pickled to disk by ``save`` and restored in
    ``__init__``.  ``process`` is the socket entry point that dispatches on
    the incoming command's 'operation' field.
    """

    def __init__(self):
        """Restore the pending queue from disk and build the InfoExtractor."""
        self.__ext_queue = {}
        if os.path.isfile(config.path_extract_list):
            # NOTE(review): the handle from open() is never closed, and
            # pickle.loads trusts whoever wrote this file.
            self.__ext_queue = pickle.loads(
                open(config.path_extract_list).read())

        self.__ie = InfoExtractor(config.path_extract_onto + "/seminar.xml",
                                  config.path_extract_onto)
        # id : item{title, url, filename, decision }
        pass

    def save(self):
        """Persist the pending queue so it survives a restart."""
        # queue
        # NOTE(review): pickle protocol -1 is binary but the file is opened
        # in text mode "w" — this corrupts the dump on Windows; confirm and
        # switch to "wb" (and "rb" on load).
        queue_file = open(config.path_extract_list, "w")
        queue_file.write(pickle.dumps(self.__ext_queue, -1))
        queue_file.close()

    def __auto_extract(self, ):
        # Placeholder: automatic extraction is not implemented yet.
        pass

    def __extract(self, item, extractor):
        """Run *extractor* on *item*, store the result in the DB, mark the
        item as extracted, and delete its page from the inbox."""
        result = self.__ie.extract(item, extractor)
        log.info(extractor)
        log.info(result)
        info = dict()
        # Truncate each attribute to the column width the ontology allows.
        # (NOTE(review): loop variable shadows builtin `str`.)
        for att, str in result.iteritems():
            info[self.__ie.db_col(att)] = str[0:self.__ie.max_len(att)]
        log.info(info)
        db.new_sem_with_map(item['id'], info)
        item['extractor'] = extractor
        item.save()

        os.remove(config.path_extractor_inbox + "/%s" % item.filename())

    def __op_new(self, data_loaded, connection):
        """Handle a newly judged item: if enough pages with the same layout
        hash already agree on an extractor, extract immediately; otherwise
        queue the item for a manual extractor decision."""
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_extractor_inbox)

        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator

            # The extractor used most often for this layout wins.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                return

        extractor = config.const_RULE_UNKNOW

        self.__ext_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor
        }

        log.info("[%s]: # %s " % (item_id, extractor))

        pass

    def __op_refresh(self, data_loaded, connection):
        """Re-scan the queue and extract any entry whose layout now has a
        confident majority extractor.  Deletions are collected first so the
        dict is not mutated while being iterated."""
        delete_ids = []
        for key, ent in self.__ext_queue.iteritems():
            item_id = int(key)
            item = UrlItem.load_with_content(
                id=item_id, file_path=config.path_extractor_inbox)
            count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
            log.info(str(maps))
            log.info(count)
            if len(maps) > 0:
                import operator
                tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
                log.info(float(tar_ext[1]) / len(maps))
                if tar_ext[1] > config.extractor_same_layout_number:
                    extractor = tool.str2extractor(tar_ext[0])
                    self.__extract(item, extractor)
                    delete_ids.append(item_id)
        # clear delete_ids
        for ent_id in delete_ids:
            del self.__ext_queue[ent_id]

    def __op_list(self, data_loaded, connection):
        """Send the whole pending queue back to the client, pickled."""
        tool.send_msg(connection, pickle.dumps(self.__ext_queue, -1))
        pass

    def __op_maps(self, data_loaded, connection):
        """Send the rule map and action map of the InfoExtractor, pickled."""
        tool.send_msg(
            connection,
            pickle.dumps(
                {
                    'rule': self.__ie.map(),
                    'action': self.__ie.action_map()
                }, -1))
        pass

    def __op_preview(self, data_loaded, connection):
        """Send a preview of what the requested extractor would extract from
        the item; for the UNKNOW rule, send one empty value per attribute."""
        log.info(data_loaded['extractor'])
        if data_loaded['extractor'] == config.const_RULE_UNKNOW:
            result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
        else:
            item_id = int(data_loaded['id'])
            item = UrlItem.load_with_content(
                item_id, file_path=config.path_extractor_inbox)
            extractor = data_loaded['extractor']
            result = self.__ie.extract(item, extractor)
        preview = list()
        # (NOTE(review): loop variable shadows builtin `str`.)
        for att, str in result.iteritems():
            preview.insert(att, dict(name=self.__ie.name(att), value=str))
        log.info(preview)
        tool.send_msg(connection, pickle.dumps(preview, -1))
        pass

    def __op_rejudge_done(self, data_loaded, connection):
        """Remove the item from the queue and hand it back to the judge
        service with the given decision."""
        item_id = int(data_loaded['id'])
        decision = int(data_loaded['decision'])
        item = UrlItem.load(id=item_id)
        del self.__ext_queue[item_id]
        self.__send_back_to_judge(item, decision)
        tool.send_msg(connection, "0")
        pass

    def __op_test_rule(self, data_loaded, connection):
        """Apply a single candidate rule to one attribute of the item and
        send the raw extraction result back."""
        item_id = int(data_loaded['id'])
        rule = data_loaded['rule']
        attrid = int(data_loaded['attrid'])
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_extractor_inbox)
        tool.send_msg(
            connection,
            self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid))
        pass

    def __op_add_rule(self, data_loaded, connection):
        """Register a new rule for an attribute, then send the refreshed
        rule/action maps back."""
        rule = data_loaded['rule']
        attrid = int(data_loaded['attrid'])
        new_id = self.__ie.add_rule(attrid, rule)
        tool.send_msg(
            connection,
            pickle.dumps(
                {
                    'rule': self.__ie.map(),
                    'action': self.__ie.action_map()
                }, -1))
        pass

    def __op_add_extractor(self, data_loaded, connection):
        """Extract the item with the manually chosen extractor and remove it
        from the pending queue."""
        item_id = int(data_loaded['id'])
        extractor = data_loaded['extractor']

        item = UrlItem.load_with_content(item_id,
                                         file_path=config.path_extractor_inbox)
        self.__extract(item, extractor)
        del self.__ext_queue[item['id']]
        tool.send_msg(connection, "0")
        pass

    @staticmethod
    def __operations(cmd):
        """Return the handler method for socket command *cmd*; raises
        KeyError for unknown commands."""
        maps = {
            config.socket_CMD_extractor_new: SAEExtractor.__op_new,
            config.socket_CMD_extractor_list: SAEExtractor.__op_list,
            config.socket_CMD_extractor_maps: SAEExtractor.__op_maps,
            config.socket_CMD_extractor_preview: SAEExtractor.__op_preview,
            config.socket_CMD_extractor_rejudge_done:
            SAEExtractor.__op_rejudge_done,
            config.socket_CMD_extractor_test_rule: SAEExtractor.__op_test_rule,
            config.socket_CMD_extractor_add_rule: SAEExtractor.__op_add_rule,
            config.socket_CMD_extractor_add_extract:
            SAEExtractor.__op_add_extractor,
            config.socket_CMD_extractor_refresh: SAEExtractor.__op_refresh,
        }
        return maps[cmd]

    @staticmethod
    def __send_back_to_judge(item, decision):
        """Move the item's page back into the judge inbox and notify the
        judge service over its socket."""
        # move file
        shutil.move(config.path_extractor_inbox + "/%s" % item.filename(),
                    config.path_judge_inbox + "/%s" % item.filename())

        # SIGNAL
        data = {
            "operation": config.socket_CMD_judge_new,
            "id": item['id'],
            "decision": decision
        }
        data_string = pickle.dumps(data, -1)
        tool.send_message(data_string, config.socket_addr_judge)

    def process(self, connection, client_address):
        """Socket entry point: read one pickled command dict from
        *connection*, dispatch it, and always close the connection.

        NOTE(review): pickle.loads on socket data trusts the peer; a hostile
        client could execute arbitrary code.
        """
        try:
            data = tool.recv_msg(connection)
            data_loaded = pickle.loads(data)
            log.debug('new connection from %s', client_address)
            log.debug("data received: %s", data_loaded)
            self.__operations(data_loaded['operation'])(self, data_loaded,
                                                        connection)
        finally:
            # Clean up the connection
            log.debug('connection closed for %s', client_address)
            connection.close()