Пример #1
0
def process_area_page(area):
	"""
	Crawl the xiaoqu listing pages of one area and store the parsed names in MySQL.

	Steps:
	1. Obtain the area page (from the network or from a saved file, depending on
	   the module-level ``args_mode``).
	2. Parse basic information from it, such as ``max_page``.
	3. Generate the URL of each listing page.
	4. Fetch each URL and save the page to a file, or read the saved file directly.
	5. Parse the page to obtain the data.
	6. Store the data in MySQL.

	:param area: area identifier, passed through to the URL and filename helpers.
	:return: None
	"""
	# 1. Obtain the area page.
	if args_mode == "from_net":
		page = spider_and_save_area_page(area)
	else:
		page = utils.read_file(filenames.get_filename_area(area))

	# 2. Parse basic information such as max_page.
	parser = parse_xiaoqu.XiaoquParser()
	parser.feed(page)
	max_page = parser.max_page

	mclient = mysql.MysqlClient()
	mysql.create_table_xiaoqu()
	for page_index in range(max_page):
		# 3. Generate the URL to crawl (page numbers passed to the helper start at 1).
		xiaoqu_url = urls.gen_url_xiaoqu(area, page_index + 1)

		# 4. Depending on the mode, fetch the URL and save the page to a file;
		#    in both modes the page is then read back from that file.
		filename = filenames.get_filename_xiaoqu(area, xiaoqu_url)
		if args_mode == "from_net":
			page = utils.fetch_page(xiaoqu_url)
			saver = utils.DataSaver(filename)
			saver.save(page)
		page = utils.read_file(filename)

		# 5. Parse the page to obtain the data.
		parser = parse_xiaoqu.XiaoquParser()
		parser.feed(page)

		# Only the results of the current page are stored. Previously all results
		# collected so far were deleted and re-inserted on every page, which
		# produced the same final rows with quadratic database work.
		for result in parser.output():
			# Single-argument form prints the same text on Python 2 and 3.
			print("get xiaoqu: %s" % (result,))

			# 6. Store in MySQL (delete first so a name is never duplicated).
			# NOTE(review): the SQL is built by string interpolation from scraped
			# text; switch to parameterized queries if MysqlClient.execute
			# supports them.
			cmd = "delete from xiaoqu where name = '%s'" % (result)
			mclient.execute(cmd)
			cmd = "insert into xiaoqu (name, area) values ('%s', '%s')" % (result, area)
			mclient.execute(cmd)

		# NOTE(review): hard-coded cap, processing stops after the third page.
		if page_index == 2:
			break
Пример #2
0
def sync_problem_files(problem_id, update: Callable[[str], None],
                       http_client: Session):
    """Download the files of a problem that are missing or outdated locally.

    The file list is requested from ``/api/judge/get_file_list``. Each file is
    stored under ``config.DATA_DIR/<problem_id>/`` next to a ``<name>.lock``
    file holding the local time of the last download. A file is downloaded
    when its lock file is missing or holds a timestamp lower than the entry's
    ``last_modified_time``.

    Args:
        problem_id: Identifier of the problem; used in the requests and as the
            directory name.
        update: Callback invoked with progress messages.
        http_client: Session used for the POST requests.
    """
    import time  # local import, hoisted out of the per-file loop

    file_list: list = http_client.post(urljoin(config.WEB_URL,
                                               "/api/judge/get_file_list"),
                                       data={
                                           "uuid": config.JUDGER_UUID,
                                           "problem_id": problem_id
                                       }).json()["data"]
    # Synchronise the problem files
    update("同步题目文件中")
    # Directory holding this problem's files
    path = os.path.join(config.DATA_DIR, str(problem_id))
    os.makedirs(path, exist_ok=True)
    for entry in file_list:
        current_file = os.path.join(path, entry["name"])
        lock_file = current_file + ".lock"
        # Skip files whose recorded download time is not older than the
        # modification time reported in the file list.
        if os.path.exists(lock_file) and float(read_file(lock_file)) >= float(
                entry["last_modified_time"]):
            continue

        update(f"下载 {entry['name']} 中..")
        print(f"Downloading {entry}")
        # Fetch the content BEFORE opening the target for writing, so a failed
        # request does not leave an existing file truncated.
        content = http_client.post(urljoin(config.WEB_URL,
                                           "/api/judge/download_file"),
                                   data={
                                       "problem_id": problem_id,
                                       "filename": entry["name"],
                                       "uuid": config.JUDGER_UUID
                                   }).content
        with open(current_file, "wb") as target:
            target.write(content)
        with open(lock_file, "w") as f:
            f.write(f"{time.time()}")
Пример #3
0
    def documents():
        """Raw text document example for testing.

        NB: targets are actually names, not categories.

        Returns:
            A pair ``(df, [Z_names, Z_docs])``. ``df`` holds the loaded
            documents in a ``raw`` column and the bunch targets in a
            ``target`` column. ``Z_names`` lists the entries returned by
            ``get_reg_files`` for the predict directory and ``Z_docs`` holds
            the content read for each of them, in the same order.
        """
        train = os.getcwd() + "/data/" + TRAIN_TEST
        bunch = load_files(train)  # -> sklearn 'bunch'
        X_raw, y = bunch.data, bunch.target  # list, np.arr
        # bunch.data are byte strings, not objects.
        doc_strings = etl.byte_str_to_str(X_raw)
        df = pd.DataFrame(doc_strings, columns=['raw'])
        # NOTE(review): a `df.dropna(axis=0)` call used to sit here, but its
        # result was discarded, so no rows were ever dropped. It is removed
        # rather than made effective because dropping rows would break the
        # one-to-one alignment with `y` below.
        df['target'] = y.tolist()  # TODO: length check skipped, needs to be added.

        # Z is our predict axis.
        path = 'data/' + DOMAIN + '/predict/'
        Z_names = list(get_reg_files(path))  # -> list of test docs
        Z_docs = [read_file(path + name) for name in Z_names]

        return df, [Z_names, Z_docs]
Пример #4
0
    def load_text(self, p, variant):
        """Return the transformed text of page ``p`` for ``variant``.

        The result is cached in a file named after the page's latest revision
        under ``self.cache_dir + self.lang``. When that file exists its content
        is returned directly. Otherwise the page HTML is parsed, the divs whose
        id matches the variant's ``modernize_div_id`` are excluded, the
        variant's ``transform`` substitutions are applied and the text is
        written to the cache.
        """
        cache_path = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        # Cache hit: reuse the stored text.
        if os.path.exists(cache_path):
            return utils.read_file(cache_path)

        # Wrap the page HTML into a full document so it can be parsed.
        body = self.get_html(p)
        document = common_html.get_head(
            u'TITLE') + u"\n<body>" + body + u'\n</body>\n</html>'
        root = etree.fromstring(document.encode('utf-8'))

        # Collect the elements that must not contribute to the text.
        div_id = self.config[variant]['modernize_div_id']
        xpath = ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % div_id
        excluded = set(root.findall(xpath))

        result = self.get_etree_text(root, excluded)
        for rule in self.config[variant]['transform']:
            # Each rule is indexed as (pattern, replacement).
            result = re.sub(rule[0], rule[1], result)

        utils.write_file(cache_path, result)
        return result
Пример #5
0
    def __init__(self, parent=None):
        """Set up the UI, styling, calendar painting hook and worker signals."""
        super(SearchEventApp, self).__init__(parent)
        self.setupUi(self)

        # Both the search button and pressing Return in the input start a search.
        self.searchButton.clicked.connect(self.search_events)
        self.searchInput.returnPressed.connect(self.search_events)

        progress_style = read_file("config/progres_bar_stylesheet.css")
        self.progressBar.setRange(0, 1)
        self.progressBar.setStyleSheet(progress_style)

        # Keep the original paintCell so the replacement can still reach it.
        self.paint_cell_calendar_old = self.calendarWidget.paintCell
        self.calendarWidget.paintCell = self.paint_calendar_cell

        self.worker = Worker()
        finished_signal = QtCore.SIGNAL("worker_finished()")
        error_signal = QtCore.SIGNAL("worker_error()")

        self.connect(self.worker, finished_signal, self.search_done)
        # Refresh the calendar cells when the worker finishes.
        self.calendarWidget.connect(self.worker, finished_signal,
                                    self.calendarWidget.updateCells)
        self.connect(self.worker, error_signal, self.search_error)

        self.events = []
Пример #6
0
    def run(self, pinglun_file, O_seeds):
        """
        Extract feature words / opinion words.

        Runs the stages in order: clean the text, parse relations, extract
        with double propagation, prune, train/load word2vec, then cluster the
        feature words and the opinion words. Intermediate results are written
        to the files configured on the instance.

        :param pinglun_file: review text (input passed to ``clean.clean_file``)
        :param O_seeds: seed opinion words
        :return: tuple ``(cf, of, F, O)``: feature clusters, opinion clusters,
                 pruned feature words, and the opinion words read back from
                 the pruned-opinion file
        """
        logger.info('pipeline run...')

        # Cleaning is skipped when the cleaned file already exists.
        if not os.path.exists(self._clean_file):
            logger.info('清洗文本')
            clean.clean_file(pinglun_file, self._clean_file)

        # Relation parsing is skipped when the relation file already exists.
        if not os.path.exists(self._relation_file):
            logger.info('句法解析')
            relation_parse.parse(self._clean_file, self._relation_file)

        # Extraction always runs, even if earlier output files exist.
        logger.info('提取特征词/评价词, double propagation算法')
        S = self._iter_sentences_relations(self._relation_file)
        F, O, fcounter, ocounter, rcount = double_propagation.extract(
            O_seeds, S)

        # Persist the raw extraction results and their counters.
        utils.write_file(self._dp_f_file, F)
        utils.write_file(self._dp_o_file, O)
        utils.save_obj(fcounter, self._dp_f_counter)
        utils.save_obj(ocounter, self._dp_o_counter)

        logger.info('特征词/评价词剪枝')
        F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)

        # Persist the pruned results.
        utils.write_file(self._prune_f_file, F)
        utils.write_file(self._prune_o_file, O)

        # The word2vec model is trained only when its file does not exist yet.
        if not os.path.exists(self._word2vec_file):
            logger.info('训练word2vec模型')
            T = self._iter_sentences_tokens(self._relation_file)
            w2c.train(T, self._word2vec_file)

        model = w2c.get(self._word2vec_file)

        logger.info('聚类特征词')
        cf = cluster.create(F, model, preference=-30)
        # One output line per cluster: the cluster key followed by its members.
        features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
        utils.write_file(self._feature_file, features)

        logger.info('聚类评价词')
        # NOTE(review): O is re-read from the pruned file here, whereas F is used
        # in-memory above; presumably intentional, confirm.
        O = utils.read_file(self._prune_o_file)
        of = cluster.create(O, model, preference=None)
        opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
        utils.write_file(self._opinion_file, opinions)

        logger.info('pipeline over.')

        return cf, of, F, O
Пример #7
0
 def __init__(self, lexicon_dir):
     """Read the files in ``lexicon_dir`` and load their words.

     Each directory entry becomes one lexicon key: the entry name with
     ``'.txt'`` removed. Its value is the set of non-empty items returned by
     ``utils.read_file`` for that entry.

     :param lexicon_dir: directory containing the word files
     """
     self._lexicon = defaultdict(set)
     for word_file in os.listdir(lexicon_dir):
         word_type = word_file.replace('.txt', '')
         lexicon_path = os.path.join(lexicon_dir, word_file)
         # Set comprehension drops empty entries and duplicates in one pass.
         self._lexicon[word_type] = {
             w for w in utils.read_file(lexicon_path) if w
         }
Пример #8
0
    def create_chart_data(self, data):
        """Fill the bar chart template with the given data set.

        :param data: mapping with a ``"DATASET"`` entry (iterable of rows whose
            first item is the label and second item the value) and a
            ``"DATASET_LABEL"`` entry
        :return: the template text with ``LABELS``, ``DATA`` and
            ``DATASET_LABEL`` substituted
        """
        template = string.Template(read_file("data_viewer/charts/bar_chart.txt"))

        quoted_labels = []
        quoted_values = []
        for row in data["DATASET"]:
            quoted_labels.append("'" + str(row[0]) + "'")
            quoted_values.append("'" + str(row[1]) + "'")

        # Joining with ", " yields the same text as appending "', " after every
        # item and slicing off the trailing separator, without building the
        # string piece by piece.
        return template.substitute({
            "LABELS": ", ".join(quoted_labels),
            "DATA": ", ".join(quoted_values),
            "DATASET_LABEL": data["DATASET_LABEL"],
        })
Пример #9
0
    def __init__(self, f):
        """Load term clusters from ``f``.

        Every line returned by ``utils.read_file(f)`` that does not start with
        ``'='`` is split on single spaces. The first non-blank term is the
        cluster head and all terms of the line (head included) form the
        cluster. Lines without any term are skipped.

        :param f: source passed to ``utils.read_file``
        """
        self._clusters = {}

        for line in utils.read_file(f):
            if line.startswith('='):
                continue

            terms = [term for term in line.split(' ') if term.strip()]
            if not terms:
                # Blank or whitespace-only line: there is no head term, and
                # indexing terms[0] would raise IndexError.
                continue
            self._clusters[terms[0]] = set(terms)

        # Flat set of every term that appears in any cluster.
        self._terms = set()
        for members in self._clusters.values():
            self._terms.update(members)
Пример #10
0
 def run(self):
     """Store the document attached to the update and set the reply text.

     Sets ``self.text`` when the gallery is missing or the upload failed, and
     when the upload succeeded and the gallery's ``sendLink`` value is truthy.
     """
     gallery = Gallery().search(tgid = self.chat_id)
     if not gallery:
         self.text = 'Gallery does not exist, please create first'
         return

     document = self.update.message.document
     remote_file = self.bot.getFile(document.file_id)
     file_name = document.file_id
     remote_file.download(file_name)

     stored = False
     if os.path.exists(file_name):
         # Push the downloaded content to storage, then clean up the local copy.
         payload = read_file(file_name, storage = 'local', append_path = False)
         stored = write_file(file_name, payload, acl = 'public-read', mime_type = document.mime_type)
         thumbnail(file_name)
         os.remove(file_name)
         write_file('%s.json' % file_name, self.update.to_json())

     if not stored:
         self.text = 'Failed to download file'
         return

     record = File(gallery_eid = gallery.eid.value, file_id = document.file_id)
     record.save()
     send_link = getattr(gallery, 'sendLink', None)
     if send_link and send_link.value:
         self.text = 'File URL: %s' % url_for('image', file_id = record.eid.value, _external = True, disable_web_page_preview = True)
Пример #11
0
    def load_text(self, p, variant):
        """Return the transformed text of page ``p`` for ``variant``.

        The result is cached in a file named after the page's latest revision
        under ``self.cache_dir + self.lang``. When that file exists its content
        is returned directly. Otherwise the page HTML is parsed, the divs whose
        id matches the variant's ``modernize_div_id`` are excluded, the
        variant's ``transform`` substitutions are applied and the text is
        written to the cache.
        """
        cache_path = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        # Cache hit: reuse the stored text.
        if os.path.exists(cache_path):
            return utils.read_file(cache_path)

        # Wrap the page HTML into a full document so it can be parsed.
        body = self.get_html(p)
        document = common_html.get_head(u'TITLE') + u"\n<body>"  + body + u'\n</body>\n</html>'
        root = etree.fromstring(document.encode('utf-8'))

        # Collect the elements that must not contribute to the text.
        div_id = self.config[variant]['modernize_div_id']
        xpath = ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % div_id
        excluded = set(root.findall(xpath))

        result = self.get_etree_text(root, excluded)
        for rule in self.config[variant]['transform']:
            # Each rule is indexed as (pattern, replacement).
            result = re.sub(rule[0], rule[1], result)

        utils.write_file(cache_path, result)
        return result
Пример #12
0
 def create_chart_data(self, data):
     """Takes a data structure and creates the data string to populate a pie chart.

     :param data: list of tuples of the form (name, value); only the first
         nine entries are used
     :return: the pie chart template text with ``LABELS``, ``DATA`` and
         ``COLORS`` substituted
     """
     template = string.Template(read_file("data_viewer/charts/pie_chart.txt"))

     quoted_labels = []
     quoted_values = []
     quoted_colors = []
     # Pairing with range(9) caps the chart at nine slices and supplies the
     # index used to pick each slice's color.
     for index, entry in zip(range(9), data):
         quoted_labels.append("'" + entry[0] + "'")
         quoted_values.append("'" + str(entry[1]) + "'")
         quoted_colors.append("'" + colors[index] + "'")

     # Joining with ", " yields the same text as appending "', " after every
     # item and slicing off the trailing separator.
     return template.substitute({
         "LABELS": ", ".join(quoted_labels),
         "DATA": ", ".join(quoted_values),
         "COLORS": ", ".join(quoted_colors),
     })
Пример #13
0
def dropOverDueTable():
    """Drop the temporary table whose name carries the date three days before now."""
    # The table name suffix is that date formatted as YYYYMMDD.
    expired_day = datetime.datetime.now() - datetime.timedelta(days=3)
    tablename = 'company_origin_info_temporary_' + expired_day.strftime('%Y%m%d')
    statement = "drop table if exists {tablename}".format(tablename=tablename)
    cursor.execute(statement)
    print('删除表:' + tablename)


if __name__ == '__main__':
    # 1. Create the temporary table and update the table name stored in the file
    msg = createTemporaryTable()
    file_path = get_file_path()
    oldMsg = read_file(file_path)
    # NOTE(review): the content is read from file_path but the replacement targets
    # the literal 'tablename.txt'; presumably the same file, confirm.
    alter_file('tablename.txt', oldMsg, msg)

    # 2. Schedule the spiders to collect the data
    schedulingSpiders()

    # 3. Update the data
    keys = CompanyValues()
    keys.lingResult()

    # 4. Drop the expired table
    dropOverDueTable()
Пример #14
0
def telegramWebHook():
    """Handle one incoming Telegram update posted to the webhook.

    Documents are stored for the chat's gallery; text messages are treated as
    bot commands (/register, /start, /create, /remove, /config). When a reply
    text was produced it is sent back to the chat.

    NOTE(review): several lines below contain ``******`` runs where part of the
    original source appears to have been redacted or lost. Those lines are not
    valid Python as shown and must be restored from the original before this
    function can run.
    """
    update = Update.de_json(request.get_json(force=True))
    text = None
    # Two-argument getattr raises AttributeError when the attribute is absent.
    if getattr(update.message, 'document'):
        gallery = Gallery().search(tgid = update.message.chat.id)
        if gallery:
            # Download the document locally, using its file id as the file name.
            newfile = bot.getFile(update.message.document.file_id)
            file_name = update.message.document.file_id
            newfile.download(file_name)
            writed = False
            if os.path.exists(file_name):
                # Re-write the local content through write_file, create the
                # thumbnail, delete the local copy and store the update JSON.
                writed = write_file(file_name, read_file(file_name, storage = 'local', append_path = False), acl = 'public-read', mime_type = update.message.document.mime_type)
                thumbnail(file_name)
                os.remove(file_name)
                write_file('%s.json' % file_name, update.to_json())
            if writed:
                file_id = File(gallery_eid = gallery.eid, file_id = update.message.document.file_id)
                file_id.save()
                sendLink = getattr(gallery, 'sendLink', None)
                # The link is only sent when sendLink equals the string 'True'.
                if sendLink == 'True':
                    text = 'File URL: %s' % url_for('image', file_id = file_id.eid, _external = True, disable_web_page_preview = True)
            else:
                text = 'Failed to download file'
        else:
            text = 'Gallery does not exist, please create first'
        pass
    if getattr(update.message, 'text'):
        # Split into at most three parts: command and up to two arguments.
        args = update.message.text.split(' ', 2)
        # NOTE(review): the next line is damaged ('******'); the `else` that follows
        # belongs to a condition that is no longer visible.
        if args[0] == '/register':
            text = 'Username:'******'Complete register: https://telegram.me/ACSGalleryBot?start=%s' % update.message.from_user.id
            else:
                text = 'User added to gallery'
            # set gallery permission at this point because i have chat id
        elif args[0] == '/start':
            if len(args) > 1 and int(args[1]) == int(update.message.chat.id):
                text = 'Username:'******'force_reply' : True })
            else:
                text = update.to_json()

        elif getattr(update.message, 'reply_to_message'):
            if update.message.reply_to_message.text == 'Username:'******'Password:'******'force_reply' : True })
                return 'ok'
            elif update.message.reply_to_message.text == 'Password:'******'User succesfuly registered'
        elif args[0] == '/create':
            if hasattr(update.message.chat, 'title'):
                # Reuse the chat's gallery when it exists, otherwise create it.
                gallery = Gallery().search(tgid = update.message.chat.id)
                if not gallery:
                    gallery = Gallery(tgid = update.message.chat.id, title = update.message.chat.title).save()
                text = 'Gallery URL: %s' % url_for('gallery', id = gallery.eid, _external = True, _scheme = 'https')
            else:
                text = 'Bot only works in groups'
        elif args[0] == '/remove':
            gallery = Gallery().search(tgid = update.message.chat.id)
            if gallery:
                gallery.delete()
                text = 'Gallery deleted'
            else:
                text = 'Gallery is not registered'
            # TODO: Confirm
        elif args[0] == '/config':
            args.pop(0)
            # NOTE(review): `search` is called on the class here but on an instance
            # (`Gallery().search`) everywhere else in this function; confirm.
            gallery = Gallery.search(tgid = update.message.chat.id)
            if gallery:
                # NOTE(review): `g` is not defined in this function; presumably a
                # module-level name, verify.
                if len(args) == 0:
                    text = g.config(update.message.chat.id)
                elif len(args) == 1:
                    # This assignment is overwritten by the next line.
                    text = 'get one'
                    text = g.config(update.message.chat.id, args[0])
                else:
                    text = g.config(update.message.chat.id, args[0], args[1])
            else:
                text = 'Gallery is not registered'
        #else:
        #    text = update.to_json()
    # Send the reply only when one of the branches produced a text.
    if text:
        bot.sendMessage(update.message.chat.id, text, disable_web_page_preview=True)
    return ""