async def main():
    conf = Config()
    logging.basicConfig(level=logging.DEBUG)
    logging.config.dictConfig(conf.DEFAULT_LOGGING)
    logger = logging.getLogger(__name__)

    db = ExtendedDBManager(init_db(conf))
    db.database.create_tables([Article], safe=True)

    executor = ThreadPoolExecutor(max_workers=10)
    loop = asyncio.get_event_loop()  # 'loop' was undefined in the original snippet
    loop.set_default_executor(executor)

    DATA_FOR_MATPLOTLIB = {}

    await truncate(db=db)
    await vacuum(db=db)
    await drop_index(db=db)

    for mode in ["noindex", "index"]:
        await truncate(db=db)
        await vacuum(db=db)
        if mode == "index":
            await create_index(db=db)
        else:
            await drop_index(db=db)

        for i in range(1, 81):
            await buck_create_new(db=db, epoch_count=i, count=10**6, mode=mode)

            row1 = await db.get(Article.select().limit(1))
            row2 = await db.get(
                Article.select().order_by(Article.created_date.desc()).limit(1))

            if mode == "noindex":
                arv_time__noindex1 = await call_avr_time(db=db, text=row1.name)
                arv_time__noindex2 = await call_avr_time(db=db, text=row2.name)
                arv_time__noindex = max(arv_time__noindex1, arv_time__noindex2)
                logger.info(f"Time NoIndex={arv_time__noindex}")
                DATA_FOR_MATPLOTLIB[str(i)] = {"noindex": arv_time__noindex}
            else:
                arv_time__index1 = await call_avr_time(db=db, text=row1.name)
                arv_time__index2 = await call_avr_time(db=db, text=row2.name)
                arv_time__index = max(arv_time__index1, arv_time__index2)
                logger.info(f"Time Index={arv_time__index}")
                DATA_FOR_MATPLOTLIB[str(i)].update({"index": arv_time__index})

            logger.info("")
            now_count = await db.count(Article.select())
            logger.info(f"Row in db count = {now_count}")
            logger.info("== == " * 15)
            logger.info("== == " * 15)

    FileReader.write_data(DATA_FOR_MATPLOTLIB)
    logger.info("Exit")
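# Usage sketch (assumption, not part of the original snippet): the benchmark
# coroutine above would typically be launched from the module entry point,
# assuming no event loop is already running.
import asyncio

if __name__ == "__main__":
    asyncio.run(main())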
def create_article(self, article: schemas.ArticleCreate) -> Any:
    """ Create New Article """
    try:
        db_article = Article(user_id=article.user_id,
                             article_title=article.article_title,
                             article_text=article.article_text,
                             tags=article.tags)
        db_article.save()
        return db_article
    except Exception as e:
        fastapi_logger.exception("create_article")
        return None
def test_db(client):
    mixin = DatabaseMixin()
    mixin.db = client.db
    mixin.Table = Article

    def check_article(first, second):
        assert first.title == second.title
        assert first.subtitle == second.subtitle
        assert first.article == second.article
        assert first.date == second.date
        assert first.image == second.image
        assert first.id == second.id

    with mixin.db.init_db():
        with client.app.test_request_context():
            articles = {}

            # create
            like_article = namedtuple(
                'article',
                ['title', 'subtitle', 'article', 'date', 'image', 'id'])
            for i in range(1, 11):
                a = mixin.create(f'title{i}', f'subtitle{i}', f'article{i}',
                                 f'date{i}', f'image{i}')
                a_ = like_article(f'title{i}', f'subtitle{i}', f'article{i}',
                                  f'date{i}', f'image{i}', i)
                check_article(a, a_)
                articles[i] = a

            # get_many
            articles_ = mixin.get_many()
            for a_ in articles_:
                a = articles[a_.id]
                check_article(a, a_)

            # get_id
            a = articles[random.randrange(1, 11)]
            a_ = mixin.get_id(a.id)
            check_article(a, a_)

            # delete_many
            articles_ = mixin.get_many()
            assert len(articles_) == 10
            mixin.delete_many()
            articles_ = mixin.get_many()
            assert len(articles_) == 0

            # Article
            obj = Article('title', 'subtitle', 'article' * 100, 'date',
                          'image', 1)
            assert len(obj.get_text(300)) + len(obj.title) == 300
async def create_index(db, index="article_name"):
    try:
        await db.execute(
            Article.raw(sql=f"CREATE INDEX {index} ON article (name);"))
    except psycopg2.ProgrammingError as e:
        return True
    return False
async def truncate(db):
    try:
        await db.execute(Article.raw(sql="TRUNCATE TABLE article;"))
        logger.info("TRUNCATE done!")
    except psycopg2.ProgrammingError as e:
        return True
    return False
def populate():
    startIndex = 0
    count = 20
    while startIndex <= 300:
        r = requests.get('http://ign-apis.herokuapp.com/articles?startIndex='
                         + str(startIndex) + '0&count=' + str(count))
        j = r.json()
        for item in j['data']:
            headline = item['metadata']['headline']
            subHeadline = item['metadata']['subHeadline']
            link = "ign.com/articles/" + item['metadata']['slug']
            a = Article(headline=headline, subHeadline=subHeadline, link=link)
            saveArticle(a)
        startIndex += count

    startIndex = 0
    count = 20
    while startIndex <= 300:
        r = requests.get('http://ign-apis.herokuapp.com/videos?startIndex='
                         + str(startIndex) + '0&count=' + str(count))
        j = r.json()
        for item in j['data']:
            name = item['metadata']['name']
            description = item['metadata']['description']
            link = item['metadata']['url']
            v = Video(name=name, description=description, link=link)
            saveVideo(v)
        startIndex += count
async def atom_task(db, data):
    query = Article.insert_many(
        data,
        fields=[Article.status, Article.name, Article.body,
                Article.created_date])
    await db.execute(query)
    return True
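# Usage sketch (assumption): `atom_task` expects `data` to be an iterable of
# rows matching the `fields=[...]` list above; the values below are
# illustrative only, and `db` is an already-configured async manager.
from datetime import datetime

sample_rows = [
    (1, f"article-{i}", "lorem ipsum", datetime.utcnow())
    for i in range(1000)
]
# inside an async context:
#     await atom_task(db, sample_rows)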
def get_article(self, article_id: str):
    """ Get A Single article """
    try:
        data = Article.objects(id=article_id).first()
        return data
    except Exception as e:
        fastapi_logger.exception("get_article")
        return None
def delete_article(self, article_id: str) -> Any:
    """ Delete Article """
    try:
        db_article = Article.objects(id=article_id)
        db_article.delete()
        return True
    except Exception as e:
        fastapi_logger.exception("delete_article")
        return None
async def vacuum(db):
    try:
        logger.info("Prepare to VACUUM")
        await db.execute(
            Article.raw(sql="VACUUM(FULL, VERBOSE, ANALYZE) article;"))
        logger.info("VACUUM done!")
    except psycopg2.ProgrammingError as e:
        return True
    return False
def article(article_id):
    # Return the requested article
    d = Article.select().where(Article.id == article_id)
    print(d)
    if not d:
        abort(404)
    d = d.get()
    path = '..\\' + d.path
    with open(file=path, mode='r', encoding='utf-8') as file:
        text = md(file.read())
    return render_template('article.html', title=d.title, text=text)
def add2db(self, id, writer, ip, title, content, time_):
    '''Insert the article into the DB. Whether it already exists must be
    checked with IsArticleExist before calling add2db.'''
    timestamp = time.mktime(
        datetime.datetime.strptime(time_, "%Y-%m-%d %H:%M:%S").timetuple())
    article = Article()
    article.id = id
    article.content = content
    article.writer = writer
    article.ip = ip
    article.title = title
    article.timestamp = timestamp
    article.category = self.categoryId
    article.comment = 0
    article.isDelete = False
    self.s_db.add(article)
    self.s_db.commit()
    if self.debug:
        try:
            print('[add]', id, ':', int(time.time()) - int(timestamp), title)
        except Exception:
            print('[error] debug print error')
    return
def update_article(self, article_id: str,
                   article: schemas.ArticleCreate) -> Any:
    """ Update Article """
    try:
        db_article = Article.objects(id=article_id).first()
        db_article.article_title = article.article_title
        db_article.article_text = article.article_text
        db_article.tags.extend(article.tags)
        db_article.modified_timestamp = datetime.utcnow()
        db_article.save()
        return db_article
    except Exception as e:
        fastapi_logger.exception("update_article")
        return None
def create_article(topic_url, category_url):
    category = session.query(Category).filter_by(url=category_url).one()
    if 'username' not in login_session:
        return redirect('/login')
    if request.method == 'POST':
        user_id = login_session['user_id']
        new_article = Article(name=request.form['aname'],
                              content=request.form['acontent'],
                              category_id=category.id,
                              user_id=user_id)
        session.add(new_article)
        session.commit()
        return redirect(
            url_for('show_category',
                    topic_url=topic_url,
                    category_url=category_url))
    else:
        return render_template('new_article.html', category=category)
def map_article(article):
    source = map_source(article['source'])
    return Article(
        id=article['id'],
        author=map_author(article['author'], source),
        title=article['title'],
        perex=article['perex'],
        body=article['body'],
        raw_body=article['raw_body'],
        published_at=article['published_at'],
        extracted_at=article['extracted_at'],
        url=article['url'],
        source_id=source.id,
        media=[map_media(article['id'], m) for m in article['media']],
        category=article['category'],
        other_info=article['other_info'],
        veracity=article['veracity'],
        monitor_id=article['monitor_id'],
        monitor_name=article['monitor_name'])
async def call_avr_time(db: ExtendedDBManager, text, n=20):
    all_tasks = [
        db.execute(
            Article.raw(
                sql=f"EXPLAIN ANALYSE SELECT * FROM article where name = '{text}';"))
        for _ in range(n)
    ]
    res = await asyncio.gather(*all_tasks)
    time = [
        float(r[0].replace("Execution Time: ", '').replace(" ms", ''))
        for row in list(res) for r in row._rows
        if r[0].startswith("Execution Time:")
    ]
    avr = mean(time)
    avr_g = 1.3 * avr
    avr_l = 0.7 * avr
    new_time = [t for t in time if avr_l <= t <= avr_g]
    return mean(new_time) if new_time else avr
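# Aside (sketch, not part of the original module): the outlier trimming in
# `call_avr_time` -- keep samples within +/-30% of the mean, then re-average --
# can be factored into a small standalone helper and tested on its own.
from statistics import mean

def trimmed_mean(samples, lower=0.7, upper=1.3):
    """Average `samples` after dropping values outside lower*avg .. upper*avg."""
    avg = mean(samples)
    kept = [s for s in samples if lower * avg <= s <= upper * avg]
    return mean(kept) if kept else avg

# trimmed_mean([1.0, 1.0, 1.0, 1.0, 2.0]) -> 1.0 (the 2.0 ms sample is dropped)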
async def index(request):
    art_count = 30
    page = int(request.rel_url.query.get('page', 0))
    if page == 1:
        raise web.HTTPFound('/')
    art_sql = Article.select().limit(art_count).offset(page * art_count)
    articles = await go(request, art_sql)
    co_sql = select([func.count(Article.c.id)])
    count = await go(request, co_sql)
    count = count[0] // 30 + 1
    pages = [x for x in range(1, count)]
    context = {
        'h1': 'Спаршенный Блог о SEO',
        'description': DESCRIPTION,
        'keywords': KEYWORDS,
        'articles': articles,
        'pages': pages
    }
    response = aiohttp_jinja2.render_template('index.html', request, context)
    return response
def __extract_to_sql(self):
    """
    Creates the article table if it does not exist.
    If the url already exists in the database, checks whether the html
    content (raw_content) has changed; otherwise it creates a new article.
    Database is set up for SQLite3.
    #TODO: hardcoded to SQLite3, get parameter from user
    """
    # Bad practice for importing, but the module creates tables on import.
    # TODO: create table when __extract_to_sql() is called
    from db import sql_session as sql

    is_exists = sql.query(
        exists().where(Article.url == self.article.url)).scalar()
    if is_exists:
        # TODO: redundant query count. is_exists should be combined with the
        # article query; affects database performance.
        article = sql.query(Article).filter_by(url=self.article.url).first()
        if article.raw_content != self.article.raw_content:
            article.raw_content = self.article.raw_content
            article.content = self.article.content
            article.title = self.article.title
            article.meta_keywords = self.article.meta_keywords
            article.meta_description = self.article.meta_description
            article.images = json.dumps(self.article.images)
            sql.commit()
    else:
        article = Article(title=self.article.title,
                          content=self.article.content,
                          url=self.article.url,
                          raw_content=self.article.raw_content,
                          meta_description=self.article.meta_description,
                          meta_keywords=self.article.meta_keywords,
                          images=json.dumps(self.article.images))
        sql.add(article)
        sql.commit()
async def drop_index(db, index="article_name"):
    try:
        await db.execute(Article.raw(sql=f"DROP INDEX IF EXISTS {index};"))
    except psycopg2.ProgrammingError as e:
        return True
    return False
def _process_record(self, item_arg):
    crawl_id, record = item_arg
    headers, content, url, date_crawled, content_type = record

    assert headers is not None
    assert content is not None
    assert url is not None
    assert date_crawled is not None
    assert content_type is not None

    status = "Processed"

    # Fix for a seg-fault
    if "nasa.gov" in url:
        return False

    # Sort out the domain
    domain_identifier = None
    logging.info("Retrieving domain...")
    domain_key = self.dc.get_Domain_key(url)
    while domain_identifier == None:
        domain_identifier = self.drw.get_domain(domain_key)
    domain = self._session.query(Domain).get(domain_identifier)
    assert domain is not None

    # Build database objects
    path = self.ac.get_path_fromurl(url)
    article = Article(path, date_crawled, crawl_id, domain, status)
    self._session.add(article)

    classified_by = self.swc.get_SoftwareVersion_fromstr(pysen.__VERSION__)
    assert classified_by is not None

    if content_type != 'text/html':
        logging.error("Unsupported content type: %s", str(content_type))
        article.status = "UnsupportedType"
        return False

    # Start the async transaction to get the plain text
    worker_req_thread = BoilerPipeWorker(content)
    worker_req_thread.start()

    # Whilst that's executing, parse the document
    logging.info("Parsing HTML...")
    html = BeautifulSoup(content)

    if html is None or html.body is None:
        article.status = "NoContent"
        return False

    # Extract the dates
    date_dict = pydate.get_dates(html)
    if len(date_dict) == 0:
        status = "NoDates"

    # Detect the language
    lang, lang_certainty = langid.classify(content)

    # Wait for the BoilerPipe thread to complete
    worker_req_thread.join()
    logging.debug(worker_req_thread.result)
    logging.debug(worker_req_thread.version)

    if worker_req_thread.result == None:
        article.status = "NoContent"
        return False

    # If the language isn't English, skip it
    if lang != "en":
        logging.info("language: %s with certainty %.2f - skipping...",
                     lang, lang_certainty)
        article.status = "LanguageError"  # Replace with something appropriate
        return False

    content = worker_req_thread.result.encode('ascii', 'ignore')

    # Headline extraction
    h_counter = 6
    headline = None
    while h_counter > 0:
        tag = "h%d" % (h_counter,)
        found = False
        for node in html.findAll(tag):
            if node.text in content:
                headline = node.text
                found = True
                break
        if found:
            break
        h_counter -= 1

    # Run keyword extraction
    keywords = self.ex(content)
    kset = KeywordSet(self.stop_list)
    nnp_sets_scored = set([])

    for word, freq, amnt in sorted(keywords):
        try:
            nnp_sets_scored.add((word, freq))
        except ValueError:
            break

    nnp_adj = set([])
    nnp_set = set([])
    nnp_vector = []
    for sentence in sent_tokenize(content):
        text = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(text)
        pos_groups = itertools.groupby(pos, lambda x: x[1])
        for k, g in pos_groups:
            if k != 'NNP':
                continue
            nnp_list = [word for word, speech in g]
            nnp_buf = []
            for item in nnp_list:
                nnp_set.add(item)
                nnp_buf.append(item)
                nnp_vector.append(item)
            for i, j in zip(nnp_buf[0:-1], nnp_buf[1:]):
                nnp_adj.add((i, j))

    nnp_vector = filter(lambda x: x.lower() not in self.stop_list, nnp_vector)
    nnp_counter = Counter(nnp_vector)
    for word in nnp_set:
        score = nnp_counter[word]
        nnp_sets_scored.add((word, score))

    for item, score in sorted(nnp_sets_scored, key=lambda x: x[1],
                              reverse=True):
        try:
            if type(item) == types.ListType or type(item) == types.TupleType:
                kset.add(' '.join(item))
            else:
                kset.add(item)
        except ValueError:
            break

    scored_nnp_adj = []
    for item1, item2 in nnp_adj:
        score = nnp_counter[item1] + nnp_counter[item2]
        scored_nnp_adj.append((item1, item2, score))

    nnp_adj = []
    for item1, item2, score in sorted(scored_nnp_adj, key=lambda x: x[1],
                                      reverse=True):
        if len(nnp_adj) < KEYWORD_LIMIT:
            nnp_adj.append((item1, item2))
        else:
            break

    # Generate list of all keywords
    keywords = set([])
    for keyword in kset:
        try:
            k = Keyword(keyword)
            keywords.add(k)
        except ValueError as ex:
            logging.error(ex)
            continue

    for item1, item2 in nnp_adj:
        try:
            k = Keyword(item1)
            keywords.add(k)
        except ValueError as ex:
            logging.error(ex)
        try:
            k = Keyword(item2)
            keywords.add(k)
        except ValueError as ex:
            logging.error(ex)

    # Resolve keyword identifiers
    keyword_resolution_worker = KeywordResolutionWorker(
        set([k.word for k in keywords]), self.redis_kw)
    keyword_resolution_worker.start()

    # Run sentiment analysis
    trace = []
    features = self.cls.classify(worker_req_thread.result, trace)
    (label, length, classified, pos_sentences, neg_sentences,
     pos_phrases, neg_phrases) = features[0:7]

    # Convert Pysen's model into database models
    try:
        doc = Document(article.id, label, length, pos_sentences,
                       neg_sentences, pos_phrases, neg_phrases, headline)
    except ValueError as ex:
        logging.error(ex)
        logging.error("Skipping this document...")
        article.status = "ClassificationError"
        return False

    self._session.add(doc)

    extracted_phrases = set([])
    for sentence, score, phrase_trace in trace:
        sentence_type = "Unknown"
        for node in html.findAll(text=True):
            if sentence.text in node.strip():
                sentence_type = node.parent.name.upper()
                break

        if sentence_type not in ["H1", "H2", "H3", "H4", "H5", "H6", "P",
                                 "Unknown"]:
            sentence_type = "Other"

        label, average, prob, pos, neg, probs, _scores = score
        s = Sentence(doc, label, average, prob, sentence_type)
        self._session.add(s)

        for phrase, prob, score, label in phrase_trace:
            p = Phrase(s, score, prob, label)
            self._session.add(p)
            extracted_phrases.add((phrase, p))

    # Wait for keyword resolution to finish
    keyword_resolution_worker.join()
    keyword_mapping = keyword_resolution_worker.out_keywords

    # Associate extracted keywords with phrases
    keyword_objects, short_keywords = kset.convert(keyword_mapping, self.kwc)
    for k in keyword_objects:
        self._session.merge(k)

    for p, p_obj in extracted_phrases:
        for k in keyword_objects:
            if k.word in p.get_text():
                nk = KeywordIncidence(k, p_obj)

    # Save the keyword adjacency list
    for i, j in kset.convert_adj_tuples(nnp_adj, keyword_mapping, self.kwc):
        self._session.merge(i)
        self._session.merge(j)
        kwa = KeywordAdjacency(i, j, doc)
        self._session.add(kwa)

    # Build date objects
    for key in date_dict:
        rec = date_dict[key]
        if "dates" not in rec:
            logging.error("OK: 'dates' is not in a pydate result record.")
            continue
        dlen = len(rec["dates"])
        if rec["text"] not in content:
            logging.debug("'%s' is not in %s", rec["text"], content)
            continue
        if dlen > 1:
            for date, day_first, year_first in rec["dates"]:
                try:
                    dobj = AmbiguousDate(date, doc, day_first, year_first,
                                         rec["prep"], key)
                except ValueError as ex:
                    logging.error(ex)
                    continue
                self._session.add(dobj)
        elif dlen == 1:
            for date, day_first, year_first in rec["dates"]:
                dobj = CertainDate(date, doc, key)
                self._session.add(dobj)
        else:
            logging.error("'dates' in a pydate result set contains no records.")

    # Process links
    for link in html.findAll('a'):
        if not link.has_attr("href"):
            logging.debug("skipping %s: no href", link)
            continue

        process = True
        for node in link.findAll(text=True):
            if node not in worker_req_thread.result:
                process = False
                break
        if not process:
            logging.debug("skipping %s because it's not in the body text",
                          link)
            break

        href, junk, junk = link["href"].partition("#")
        if "http://" in href:
            try:
                domain_id = None
                domain_key = self.dc.get_Domain_key(href)
                while domain_id is None:
                    domain_id = self.drw.get_domain(domain_key)
                assert domain_id is not None
                href_domain = self._session.query(Domain).get(domain_id)
            except ValueError as ex:
                logging.error(ex)
                logging.error("Skipping this link")
                continue
            href_path = self.ac.get_path_fromurl(href)
            lnk = AbsoluteLink(doc, href_domain, href_path)
            self._session.add(lnk)
            logging.debug("Adding: %s", lnk)
        else:
            href_path = href
            try:
                lnk = RelativeLink(doc, href_path)
            except ValueError as ex:
                logging.error(ex)
                logging.error("Skipping link")
                continue
            self._session.add(lnk)
            logging.debug("Adding: %s", lnk)

    # Construct software involvement records
    self_sir = SoftwareInvolvementRecord(
        self.swc.get_SoftwareVersion_fromstr(self.__VERSION__),
        "Processed", doc)
    date_sir = SoftwareInvolvementRecord(
        self.swc.get_SoftwareVersion_fromstr(pydate.__VERSION__),
        "Dated", doc)
    clas_sir = SoftwareInvolvementRecord(
        self.swc.get_SoftwareVersion_fromstr(pysen.__VERSION__),
        "Classified", doc)
    extr_sir = SoftwareInvolvementRecord(
        self.swc.get_SoftwareVersion_fromstr(worker_req_thread.version),
        "Extracted", doc)

    for sw in [self_sir, date_sir, clas_sir, extr_sir]:
        self._session.merge(sw, load=True)

    logging.debug("Domain: %s", domain)
    logging.debug("Path: %s", path)

    article.status = status

    # Commit to database, return the article id on success
    try:
        self._session.commit()
    except OperationalError as ex:
        logging.error(ex)
        self._session.rollback()
        return None

    return article.id
driver.get(url)
time.sleep(3)
accept_cookies()
time.sleep(3)

article = session.query(Article).filter(Article.article_url == url).first()
if article is None:
    article_title = driver.find_element_by_css_selector(
        "h1.article-title").text
    article_publication_date = datetime.datetime.strptime(
        driver.find_element_by_css_selector(
            "p.article-pubdate").text.strip(),
        "%d. %B %Y, %H:%M",
    )
    article = Article(article_title, url, article_publication_date)
    session.add(article)
    session.commit()

page_count = 1
if args.continue_article:
    if article.article_id != args.continue_article:
        logger.debug(
            f"Skipping article {article} with id {article.article_id}.")
        continue
    last_posting_ref_id = get_last_crawled_posting_id_for_article(
        article.article_id)
    if last_posting_ref_id:
        page_count = go_to_page_with_posting_id(
            last_posting_ref_id, page_count)
async def worker(qu, coro_num, session, engine):
    loop = asyncio.get_running_loop()
    while True:
        if qu.qsize() == 0:
            break
        url = await qu.get()
        try:
            prox = random.choice(proxies_list)
            proxies = {'http': prox, 'https': prox}
            headers = {'User-Agent': random.choice(user_agents)}
            print(f'[Send request in {coro_num}] [queue_size {qu.qsize()}]', url)
            response = await session.get(url, headers=headers, timeout=10)

            if '/category/' in url:
                post_urls = response.html.xpath('//h3/a/@href')
                for u in post_urls:
                    if u.endswith('.html'):
                        if u not in articles:
                            await qu.put(u)
                            articles.add(u)
                continue

            post = {}
            name = response.html.xpath('//h1/text()')[0]
            post['name'] = await loop.run_in_executor(
                None, translate_text, name, 'ru', 'uk')
            post['slug'] = slugify(post['name'])
            post['source'] = url
            post['category'] = response.html.xpath(
                '//ul[@class="td-category"]/li/a/text()')
            post['category'] = ','.join(post['category'])
            post['image'] = response.html.xpath(
                '//div[@class="td-post-featured-image"]//img/@src')[0]

            elements = response.html.xpath('//p')
            post['content'] = ''
            post['parsed_time'] = datetime.now().date()
            for elem in elements:
                translated = await loop.run_in_executor(
                    None, translate_text, elem.text, 'ru', 'uk')
                post['content'] += f'<p>{translated}</p>\n'
                del translated

            async with engine.acquire() as cursor:
                sql = Article.insert().values(**post)
                await cursor.execute(sql)
                print('[Article saved]', post["name"])

            del url, prox, proxies, headers, response, post, sql
        except (ConnectionError, ReadTimeout):
            await qu.put(url)
        except KeyboardInterrupt:
            quit()
        except Exception as e:
            print(e, type(e), sys.exc_info()[2].tb_lineno)
             url='Artificial-Intelligence', image='AI.jpeg')
session.add(AI)
session.commit()

supervised_learning = Category(name='Supervised Learning',
                               url='Supervised-Learning',
                               topic_id=1,
                               image='supervised.png')
session.add(supervised_learning)
session.commit()

neural_networks = Article(name='Neural Networks',
                          category_id=1,
                          content='Artificial neural networks (ANN) or ' +
                                  'connectionist systems are computing ' +
                                  'systems vaguely inspired by the biological ' +
                                  'neural networks that constitute ' +
                                  ' animal brains.')
session.add(neural_networks)
session.commit()

unsupervised_learning = Category(name='Unsupervised Learning',
                                 url='Unupervised-Learning',
                                 topic_id=1,
                                 image='unsupervised.png')
session.add(unsupervised_learning)
session.commit()

human_computer_interaction = Topic(name='Human Computer Interaction',
                                   url='Human-Computer-Interaction',