def process_area_page(area):
    """
    Process one area page:
    1. fetch the area page
    2. parse basic info from it, e.g. max_page
    3. generate the URLs to crawl
    4. depending on args_mode, fetch each URL and cache the page in a file,
       or read it straight from the cached file
    5. parse each page and extract the data
    6. store the results in MySQL
    """
    # 1. fetch the area page
    if args_mode == "from_net":
        page = spider_and_save_area_page(area)
    else:
        filename = filenames.get_filename_area(area)
        page = utils.read_file(filename)
    # 2. parse basic info, e.g. max_page
    parser = parse_xiaoqu.XiaoquParser()
    parser.feed(page)
    max_page = parser.max_page
    result_list = []
    mclient = mysql.MysqlClient()
    mysql.create_table_xiaoqu()
    for i, page_num in enumerate(range(max_page)):
        # 3. generate the URL to crawl
        xiaoqu_url = urls.gen_url_xiaoqu(area, page_num + 1)
        # 4. fetch the URL and cache it to a file, or reuse the cached file
        filename = filenames.get_filename_xiaoqu(area, xiaoqu_url)
        if args_mode == "from_net":
            page = utils.fetch_page(xiaoqu_url)
            saver = utils.DataSaver(filename)
            saver.save(page)
        page = utils.read_file(filename)
        # 5. parse the page and extract the data
        parser = parse_xiaoqu.XiaoquParser()
        parser.feed(page)
        # iterate only over this page's results; the old code looped over the
        # accumulated result_list and re-inserted earlier pages every time
        page_results = parser.output()
        result_list += page_results
        for result in page_results:
            print("get xiaoqu:", result)
            # 6. store in MySQL; parameterized queries avoid SQL injection
            # (assumes MysqlClient.execute forwards params to the DB-API cursor)
            mclient.execute("delete from xiaoqu where name = %s", (result,))
            mclient.execute("insert into xiaoqu (name, area) values (%s, %s)",
                            (result, area))
        if i == 2:  # debug limiter: stop after the first three pages
            break
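A minimal driver sketch for the function above; `args_mode` is the module-level flag the snippet reads, and the area names are invented for illustration:

if __name__ == '__main__':
    args_mode = "from_net"  # any other value replays the cached page files
    for area in ["haidian", "chaoyang"]:  # hypothetical area slugs
        process_area_page(area)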
def sync_problem_files(problem_id, update: Callable[[str], None], http_client: Session):
    import time  # used below to timestamp the .lock files
    file_list: list = http_client.post(urljoin(config.WEB_URL, "/api/judge/get_file_list"), data={
        "uuid": config.JUDGER_UUID,
        "problem_id": problem_id
    }).json()["data"]
    # sync the problem files
    update("Syncing problem files")
    # directory holding this problem's files
    path = os.path.join(config.DATA_DIR, str(problem_id))
    os.makedirs(path, exist_ok=True)
    for file in file_list:
        current_file = os.path.join(path, file["name"])
        # download if the lock file is missing, or its timestamp is older than
        # the server-side last-modified time
        if not os.path.exists(current_file + ".lock") or float(
                read_file(current_file + ".lock")) < float(
                file["last_modified_time"]):
            update(f"Downloading {file['name']}..")
            print(f"Downloading {file}")
            with open(current_file, "wb") as target:
                target.write(
                    http_client.post(urljoin(config.WEB_URL, "/api/judge/download_file"),
                                     data={
                                         "problem_id": problem_id,
                                         "filename": file["name"],
                                         "uuid": config.JUDGER_UUID
                                     }).content)
            # record the sync time so unchanged files are skipped next run
            with open(current_file + ".lock", "w") as f:
                f.write(f"{time.time()}")
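A hedged usage sketch; the problem id and the progress callback are invented, and `config` is assumed to expose WEB_URL, JUDGER_UUID, and DATA_DIR as the snippet requires:

from requests import Session

def log_progress(msg: str) -> None:
    print(msg)

with Session() as client:
    sync_problem_files(1001, log_progress, client)  # 1001 is a made-up id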
def documents():
    """
    Raw text document example for testing.
    NB. targets are actually names, not categories.
    """
    Z_docs = []
    Z_names = []
    train = os.getcwd() + "/data/" + TRAIN_TEST
    bunch = load_files(train)  # -> sklearn Bunch
    X_raw, y = bunch.data, bunch.target  # list, np.ndarray
    # bunch.data holds byte strings; decode them to str
    doc_strings = etl.byte_str_to_str(X_raw)
    df = pd.DataFrame(doc_strings, columns=['raw'])
    df['target'] = y.tolist()  # @TODO: add a length check against df
    # dropna returns a copy; the old code discarded the result, so drop rows
    # only after 'target' is attached to keep the two columns aligned
    df = df.dropna(axis=0)
    # Z is our predict axis.
    path = 'data/' + DOMAIN + '/predict/'
    predict_files = get_reg_files(path)  # -> list of test doc filenames
    for f in predict_files:
        Z_names.append(f)
        Z_docs.append(read_file(path + f))
    return df, [Z_names, Z_docs]
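A usage sketch, assuming the TRAIN_TEST and DOMAIN globals point at existing data directories:

df, (Z_names, Z_docs) = documents()
print(df.shape, 'training rows;', len(Z_docs), 'documents to predict')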
def load_text(self, p, variant):
    # cache the extracted text per page revision
    filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())
    if not os.path.exists(filename):
        html = self.get_html(p)
        new_html = common_html.get_head(
            u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(new_html.encode('utf-8'))
        # collect the div holding the modernized variant so it is skipped
        exclude = set()
        html_id = self.config[variant]['modernize_div_id']
        for it in root.findall(
                ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)
        text = self.get_etree_text(root, exclude)
        # apply the variant's regex transforms in order
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)
        utils.write_file(filename, text)
    else:
        text = utils.read_file(filename)
    return text
def __init__(self, parent=None):
    super(SearchEventApp, self).__init__(parent)
    self.setupUi(self)
    self.searchButton.clicked.connect(self.search_events)
    self.progressBar.setRange(0, 1)
    self.progressBar.setStyleSheet(
        read_file("config/progres_bar_stylesheet.css"))
    # monkey-patch the calendar's paintCell, keeping a reference to the
    # original painter for the default case
    self.paint_cell_calendar_old = self.calendarWidget.paintCell
    self.calendarWidget.paintCell = self.paint_calendar_cell
    self.searchInput.returnPressed.connect(self.search_events)
    self.worker = Worker()
    self.connect(self.worker, QtCore.SIGNAL("worker_finished()"),
                 self.search_done)
    # repaint the calendar when the worker finishes
    self.calendarWidget.connect(self.worker,
                                QtCore.SIGNAL("worker_finished()"),
                                self.calendarWidget.updateCells)
    self.connect(self.worker, QtCore.SIGNAL("worker_error()"),
                 self.search_error)
    self.events = []
def run(self, pinglun_file, O_seeds):
    """
    Extract feature words / opinion words.
    :param pinglun_file: review text file
    :param O_seeds: seed opinion words
    :return: feature clusters, opinion clusters, feature words, opinion words
    """
    logger.info('pipeline run...')
    if not os.path.exists(self._clean_file):
        logger.info('cleaning text')
        clean.clean_file(pinglun_file, self._clean_file)
    if not os.path.exists(self._relation_file):
        logger.info('dependency parsing')
        relation_parse.parse(self._clean_file, self._relation_file)
    logger.info('extracting feature/opinion words with the double propagation algorithm')
    S = self._iter_sentences_relations(self._relation_file)
    F, O, fcounter, ocounter, rcount = double_propagation.extract(O_seeds, S)
    utils.write_file(self._dp_f_file, F)
    utils.write_file(self._dp_o_file, O)
    utils.save_obj(fcounter, self._dp_f_counter)
    utils.save_obj(ocounter, self._dp_o_counter)
    logger.info('pruning feature/opinion words')
    F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)
    utils.write_file(self._prune_f_file, F)
    utils.write_file(self._prune_o_file, O)
    if not os.path.exists(self._word2vec_file):
        logger.info('training word2vec model')
        T = self._iter_sentences_tokens(self._relation_file)
        w2c.train(T, self._word2vec_file)
    model = w2c.get(self._word2vec_file)
    logger.info('clustering feature words')
    cf = cluster.create(F, model, preference=-30)
    features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
    utils.write_file(self._feature_file, features)
    logger.info('clustering opinion words')
    O = utils.read_file(self._prune_o_file)
    of = cluster.create(O, model, preference=None)
    opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
    utils.write_file(self._opinion_file, opinions)
    logger.info('pipeline over.')
    return cf, of, F, O
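A hedged usage sketch; the `Pipeline` class name and its zero-argument constructor are assumptions (only run() above is from the original), and the seed opinion words are invented:

pipeline = Pipeline()  # assumed to configure the intermediate file paths
cf, of, F, O = pipeline.run('reviews.txt', O_seeds={u'好', u'不错', u'差'})
print('%d feature clusters, %d opinion clusters' % (len(cf), len(of)))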
def __init__(self, lexicon_dir):
    self._lexicon = defaultdict(set)
    # read each file in the directory and load its words, keyed by file stem
    for word_file in os.listdir(lexicon_dir):
        word_type = word_file.replace('.txt', '')
        words = set(
            w for w in utils.read_file(os.path.join(lexicon_dir, word_file))
            if w)
        self._lexicon[word_type] = words
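The loop above implies a directory of one-word-per-line text files, keyed by file stem; a made-up layout (`Lexicon` is an assumed name for the class above):

# lexicon/positive.txt, lexicon/negative.txt, ...  (invented file names)
lex = Lexicon('lexicon')
print(lex._lexicon['positive'])  # the non-empty lines of positive.txt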
def create_chart_data(self, data):
    """Takes a data structure and creates the data string to populate a bar chart."""
    js_string = read_file("data_viewer/charts/bar_chart.txt")
    template_string = string.Template(js_string)
    # build the comma-separated, quoted label and value lists for the template
    labels = ', '.join("'%s'" % str(d[0]) for d in data["DATASET"])
    chart_data = ', '.join("'%s'" % str(d[1]) for d in data["DATASET"])
    chart_dict = dict()
    chart_dict["LABELS"] = labels
    chart_dict["DATA"] = chart_data
    chart_dict["DATASET_LABEL"] = data["DATASET_LABEL"]
    js_string = template_string.substitute(chart_dict)
    return js_string
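The shape of `data` can be read off the keys the function uses; a sketch with invented values (`viewer` is a hypothetical instance of the enclosing class):

data = {
    "DATASET_LABEL": "Monthly totals",
    "DATASET": [("Jan", 10), ("Feb", 12), ("Mar", 7)],
}
js = viewer.create_chart_data(data)
# substitutes LABELS -> 'Jan', 'Feb', 'Mar' and DATA -> '10', '12', '7'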
def __init__(self, f):
    lines = utils.read_file(f)
    self._clusters = {}
    for line in lines:
        if line.startswith('='):
            continue
        # first term on the line is the cluster head; the rest are members
        # (the old list comprehension shadowed the parameter f)
        terms = [t for t in line.split(' ') if t.strip() != '']
        self._clusters[terms[0]] = set(terms)
    self._terms = set()
    for head in self._clusters:
        for feature in self._clusters[head]:
            self._terms.add(feature)
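The parser implies a plain-text format: '='-prefixed lines are skipped, and on every other line the first whitespace-separated term is the cluster head; a made-up example (`Clusters` is an assumed name for the class above):

# clusters.txt (invented contents):
#   price cost fee
#   screen display
c = Clusters('clusters.txt')
# c._clusters == {'price': {'price', 'cost', 'fee'},
#                 'screen': {'screen', 'display'}}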
def run(self):
    gallery = Gallery().search(tgid=self.chat_id)
    if gallery:
        newfile = self.bot.getFile(self.update.message.document.file_id)
        file_name = self.update.message.document.file_id
        newfile.download(file_name)
        written = False
        if os.path.exists(file_name):
            # push the downloaded file to storage, generate its thumbnail,
            # then drop the local copy and archive the raw update as JSON
            written = write_file(file_name,
                                 read_file(file_name, storage='local', append_path=False),
                                 acl='public-read',
                                 mime_type=self.update.message.document.mime_type)
            thumbnail(file_name)
            os.remove(file_name)
            write_file('%s.json' % file_name, self.update.to_json())
        if written:
            file_id = File(gallery_eid=gallery.eid.value,
                           file_id=self.update.message.document.file_id)
            file_id.save()
            sendLink = getattr(gallery, 'sendLink', None)
            if sendLink and sendLink.value:
                self.text = 'File URL: %s' % url_for('image', file_id=file_id.eid.value,
                                                     _external=True,
                                                     disable_web_page_preview=True)
        else:
            self.text = 'Failed to download file'
    else:
        self.text = 'Gallery does not exist, please create first'
def create_chart_data(self, data):
    """Takes a data structure and creates the data string to populate a pie chart."""
    # data is a list of (name, value) tuples
    js_string = read_file("data_viewer/charts/pie_chart.txt")
    template_string = string.Template(js_string)
    labels = []
    chart_data = []
    background_colors = []
    for i, d in enumerate(data):
        labels.append("'%s'" % d[0])
        chart_data.append("'%s'" % str(d[1]))
        background_colors.append("'%s'" % colors[i])
        if i == 8:  # the colors palette covers at most nine slices
            break
    chart_dict = dict()
    chart_dict["LABELS"] = ', '.join(labels)
    chart_dict["DATA"] = ', '.join(chart_data)
    chart_dict["COLORS"] = ', '.join(background_colors)
    js_string = template_string.substitute(chart_dict)
    return js_string
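Unlike the bar-chart variant, `data` here is a flat list of (name, value) tuples; a sketch with invented values:

data = [("red", 4), ("green", 9), ("blue", 2)]
js = viewer.create_chart_data(data)  # 'viewer' is a hypothetical instance;
                                     # at most nine slices are emitted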
def dropOverDueTable():
    # current time
    today = datetime.datetime.now()
    # offset: three days back
    offset = datetime.timedelta(days=-3)
    # date string for the target day
    re_date = (today + offset).strftime('%Y%m%d')
    tablename = 'company_origin_info_temporary_' + str(re_date)
    DROPTABLE_SQL = "drop table if exists {tablename}".format(
        tablename=tablename)
    cursor.execute(DROPTABLE_SQL)
    print('dropped table: ' + tablename)

if __name__ == '__main__':
    # 1. create the temporary table and update the table name in the file
    msg = createTemporaryTable()
    file_path = get_file_path()
    oldMsg = read_file(file_path)
    alter_file('tablename.txt', oldMsg, msg)
    # 2. schedule the spiders to fetch data
    schedulingSpiders()
    # 3. update the data
    keys = CompanyValues()
    keys.lingResult()
    # 4. drop the expired table
    dropOverDueTable()
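To make the naming scheme concrete, a small check with an invented reference date:

import datetime
d = (datetime.datetime(2024, 1, 10) + datetime.timedelta(days=-3)).strftime('%Y%m%d')
assert 'company_origin_info_temporary_' + d == 'company_origin_info_temporary_20240107'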
def telegramWebHook():
    update = Update.de_json(request.get_json(force=True))
    text = None
    # NB: the '******' runs below are secret-masked string literals and code
    # from the source; they are left as-is since the content is unrecoverable
    if getattr(update.message, 'document', None):
        gallery = Gallery().search(tgid=update.message.chat.id)
        if gallery:
            newfile = bot.getFile(update.message.document.file_id)
            file_name = update.message.document.file_id
            newfile.download(file_name)
            written = False
            if os.path.exists(file_name):
                written = write_file(file_name,
                                     read_file(file_name, storage='local', append_path=False),
                                     acl='public-read',
                                     mime_type=update.message.document.mime_type)
                thumbnail(file_name)
                os.remove(file_name)
                write_file('%s.json' % file_name, update.to_json())
            if written:
                file_id = File(gallery_eid=gallery.eid, file_id=update.message.document.file_id)
                file_id.save()
                sendLink = getattr(gallery, 'sendLink', None)
                if sendLink == 'True':
                    text = 'File URL: %s' % url_for('image', file_id=file_id.eid,
                                                    _external=True, disable_web_page_preview=True)
            else:
                text = 'Failed to download file'
        else:
            text = 'Gallery does not exist, please create first'
    if getattr(update.message, 'text', None):
        args = update.message.text.split(' ', 2)
        if args[0] == '/register':
            text = 'Username:'******'Complete register: https://telegram.me/ACSGalleryBot?start=%s' % update.message.from_user.id
            else:
                text = 'User added to gallery'
                # set gallery permission at this point because i have chat id
        elif args[0] == '/start':
            if len(args) > 1 and int(args[1]) == int(update.message.chat.id):
                text = 'Username:'******'force_reply' : True })
            else:
                text = update.to_json()
        elif getattr(update.message, 'reply_to_message', None):
            if update.message.reply_to_message.text == 'Username:'******'Password:'******'force_reply' : True })
                return 'ok'
            elif update.message.reply_to_message.text == 'Password:'******'User successfully registered'
        elif args[0] == '/create':
            if hasattr(update.message.chat, 'title'):
                gallery = Gallery().search(tgid=update.message.chat.id)
                if not gallery:
                    gallery = Gallery(tgid=update.message.chat.id, title=update.message.chat.title).save()
                text = 'Gallery URL: %s' % url_for('gallery', id=gallery.eid, _external=True, _scheme='https')
            else:
                text = 'Bot only works in groups'
        elif args[0] == '/remove':
            gallery = Gallery().search(tgid=update.message.chat.id)
            if gallery:
                gallery.delete()
                text = 'Gallery deleted'
            else:
                text = 'Gallery is not registered'
            # TODO: Confirm
        elif args[0] == '/config':
            args.pop(0)
            gallery = Gallery.search(tgid=update.message.chat.id)
            if gallery:
                if len(args) == 0:
                    text = g.config(update.message.chat.id)
                elif len(args) == 1:
                    text = g.config(update.message.chat.id, args[0])
                else:
                    text = g.config(update.message.chat.id, args[0], args[1])
            else:
                text = 'Gallery is not registered'
    if text:
        bot.sendMessage(update.message.chat.id, text, disable_web_page_preview=True)
    return ""