def get_texts_extra(project, set, user, pw, result_dict):
    from amcatclient import AmcatAPI
    from elasticsearch import Elasticsearch
    import string
    import re

    texts = []
    conn = AmcatAPI('http://jamcat.mscc.huji.ac.il', user, pw)
    #es = Elasticsearch(['http://jamcat.mscc.huji.ac.il:9200'])
    es = Elasticsearch()
    for id, b in result_dict.items():
        result = es.search(index="amcat", doc_type="article",
                           body={"query": {"match": {"id": id}}})
        try:
            r = result['hits']['hits'][0]['_source']
            medium = r['medium']
            date = r['date']
            title = ''
            subtitle = ''
            letters = string.ascii_lowercase[0:len(b) + 1]
            for i in range(len(b)):
                # strip parenthesized asides and give each fragment its own id (id_a, id_b, ...)
                text = re.sub(r'\(.*?\)', '', b[i])
                id_ = str(id) + '_' + str(letters[i])
                article = [id_, medium, date, title, subtitle, text]
                texts.append(article)
        except (IndexError, KeyError):
            print(str(id) + ' is not in the database anymore')
    print(len(texts), 'texts retrieved...')
    return texts
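# A minimal usage sketch for get_texts_extra (the id, credentials and the shape
# of result_dict are assumptions inferred from the function body, not taken from
# the original): result_dict maps an AmCAT article id to a list of text fragments,
# and every fragment becomes one [id_a, medium, date, title, subtitle, text] row.
example_result_dict = {123456: ["First fragment (aside).", "Second fragment."]}
rows = get_texts_extra(project=1, set=1, user="me", pw="secret",
                       result_dict=example_result_dict)
for row in rows:
    print(row)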
def get_ids(amcat_server: AmcatAPI, project: int, articleset: int) -> Iterable[int]:
    """
    Get the article ids for this articleset

    :param amcat_server: AmCAT server
    :param project: AmCAT project ID
    :param articleset: AmCAT Articleset ID
    :return: sequence of AmCAT article IDs
    """
    return (x['id'] for x in amcat_server.get_articles(project, articleset, columns=['id']))
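# A minimal usage sketch for get_ids (the server URL and set numbers are
# placeholders): the function returns a generator, so wrap it in list() if all
# ids are needed at once. Credentials are read from ~/.amcatauth if present.
from amcatclient import AmcatAPI

server = AmcatAPI("http://amcat.vu.nl")
article_ids = list(get_ids(server, project=1, articleset=42))
print(len(article_ids), "article ids retrieved")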
def process(amcat_server: AmcatAPI, project: int, articleset: int,
            nlpipe_server: Client, module: str,
            reset_error: bool = False, reset_started: bool = False,
            to_naf: bool = False) -> None:
    """
    Process the given documents

    :param amcat_server: AmCAT server (url str or AmcatAPI object)
    :param project: AmCAT project ID (int)
    :param articleset: AmCAT Articleset ID (int)
    :param nlpipe_server: NLPipe server (url/dirname str or nlpipe.Client object)
    :param module: NLPipe module name (str)
    :param reset_error: Re-assign documents with errors
    :param reset_started: Re-set started documents to pending
    :param to_naf: Assign as NAF documents with metadata (otherwise, assign as plain text)
    """
    status = get_status(amcat_server, project, articleset, nlpipe_server, module)
    accept_status = {"UNKNOWN"}
    if reset_error:
        accept_status |= {"ERROR"}
    if reset_started:
        accept_status |= {"STARTED"}
    todo = [id for (id, status) in status.items() if status in accept_status]
    if todo:
        logging.info("Assigning {} articles from {amcat_server} set {project}:{articleset}"
                     .format(len(todo), **locals()))
        columns = ('headline,text,creator,date,url,uuid,medium,section,page'
                   if to_naf else 'headline,text')
        for page in amcat_server.get_articles_by_id(articles=todo, columns=columns,
                                                    page_size=100, yield_pages=True):
            arts = page['results']
            ids = [a['id'] for a in arts]
            texts = [_get_text(a, to_naf=to_naf) for a in arts]
            logging.debug("Assigning {} articles".format(len(ids)))
            nlpipe_server.bulk_process(module, texts, ids=ids,
                                       reset_error=reset_error, reset_pending=reset_started)
    logging.info("Done! Assigned {} articles".format(len(todo)))
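# A minimal sketch of calling process() directly rather than through the CLI
# further below (URLs, project/set numbers and the module name are placeholders;
# get_client is the same helper the command-line entry point uses to build an
# NLPipe client):
amcat = AmcatAPI("http://amcat.vu.nl")
nlpipe = get_client("http://nlpipe.example.org", None)  # second argument: auth token
process(amcat, project=1, articleset=42, nlpipe_server=nlpipe,
        module="example_module",  # placeholder; use a real NLPipe module name
        reset_error=True)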
def check_credentials(self, drop_token=True):
    if drop_token:
        token = None
    else:
        token = self.token_input or None
    if token or self.passwd_input:
        try:
            api = AmcatAPI(self.host_input, self.user_input, self.passwd_input, token)
            if api.token is None:
                api = None
        except (APIError, HTTPError):
            logging.exception("Error on getting credentials")
            api = None
    else:
        api = None
    self.passwd_input = ''
    self.token_input = api and api.token
    self.save_credentials()
    self.api = api
parser.add_argument('--items_per_page', type=int, default=100)
parser.add_argument('--start', type=int, default=0,
                    help="Record number to start from (default=0)")
parser.add_argument('project', type=int)
parser.add_argument('articleset',
                    help="Either the number of an existing set, or the name of a set to create")
parser.add_argument('ppn', help="The ppn of the newspaper to scrape (see delpher.nl)")
parser.add_argument('from_date', help="Starting date to scrape, format 31-12-1999")
parser.add_argument('to_date', help="End date to scrape, format 31-12-1999")
args = parser.parse_args()

logging.basicConfig(format='[%(asctime)s %(levelname)s %(name)s:%(lineno)s %(threadName)s] %(message)s',
                    level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.WARNING)

logging.info("Connecting to AmCAT at {host}".format(**args.__dict__))
from amcatclient import AmcatAPI
api = AmcatAPI(args.host)

try:
    setid = int(args.articleset)
except ValueError:
    logging.info("Creating new articleset {articleset} in project {project}".format(**args.__dict__))
    prov = ("[{}] Scraped from delpher using delpher.py, ppn={ppn}, from_date={from_date}, to_date={to_date}"
            .format(datetime.now().isoformat(), **args.__dict__))
    setid = api.create_set(project=args.project, name=args.articleset, provenance=prov)["id"]

logging.info("Scraping ppn:{ppn} from {from_date}-{to_date} into set {project}:{set}"
             .format(set=setid, **args.__dict__))
scrape_delpher(api, args.project, setid, start=args.start,
               from_date=args.from_date, to_date=args.to_date, ppn=args.ppn)
        link = a.get("href")
        links.append(link)
    return links


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s %(name)-12s %(levelname)-5s] %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument("server", help="AmCAT host name")
    parser.add_argument("project", help="AmCAT project")
    parser.add_argument("articleset", help="AmCAT Articleset ID", type=int)
    parser.add_argument("--batchsize", help="Batch size for uploading to AmCAT", type=int, default=10)
    args = parser.parse_args()

    logging.info(f"Scraping into AmCAT {args.articleset}")
    conn = AmcatAPI(args.server)
    for p in range(0, 67):
        articles = []
        page = get_page(p)
        links = get_links(page)
        for link in links:
            article = get_article(link)
            articles.append(article)
        conn.create_articles(args.project, args.articleset, articles)
    arts = list(get_articles(urls))
    logging.info("Adding {} articles to set {}:{}".format(len(arts), project, articleset))
    conn.create_articles(project=project, articleset=articleset, json_data=arts)


if __name__ == '__main__':
    from amcatclient import AmcatAPI
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('host', help='The AmCAT host to connect to, e.g. http://amcat.vu.nl')
    parser.add_argument('project', help='Project ID to add the articles to')
    parser.add_argument('query', help='Wikinews query for scraping')
    parser.add_argument('--username', help='Username for AmCAT login')
    parser.add_argument('--password', help='Password for AmCAT login')
    args = parser.parse_args()

    conn = AmcatAPI(args.host, args.username, args.password)
    category = "Iraq"
    articleset = conn.create_set(
        project=args.project,
        name="Wikinews articles for {}".format(args.query),
        provenance="Scraped from wikinews on {}".format(datetime.datetime.now().isoformat()))
    scrape_wikinews(conn, args.project, articleset['id'], args.query)


        "url": url,
    }


def get_links():
    for page in range(435, 734):
        url = URL_TEMPLATE.format(**locals())
        print(url)
        page = requests.get(url)
        open("/tmp/test.html", "w").write(page.text)
        tree = html.fromstring(page.text)
        for article in tree.cssselect("#results .grid-element"):
            a, = article.cssselect('a.siteLink')
            link = a.get("href")
            if not link.startswith("https://www.om.nl/actueel/nieuwsberichten"):
                continue
            locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8')
            date, = article.cssselect("div.iprox-content.iprox-date.date")
            date = date.text_content().split("-")[0].strip()
            #date = date.text_content().strip()
            date2 = datetime.datetime.strptime(date, "%d %B %Y")
            yield link, date2


from amcatclient import AmcatAPI
conn = AmcatAPI("https://amcat.nl")
for link, date in get_links():
    a = scrape_pb(link, date)
    conn.create_articles(2088, 80277, [a])
def _amcat(amcat_server: Union[str, AmcatAPI]) -> AmcatAPI:
    return AmcatAPI(amcat_server) if isinstance(amcat_server, str) else amcat_server
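# A minimal sketch of how _amcat is meant to be used (the URL is a placeholder):
# callers can pass either a host URL or an existing AmcatAPI object and always
# get a connection object back.
conn = _amcat("http://amcat.vu.nl")  # builds a new AmcatAPI from the URL
conn = _amcat(conn)                  # an existing connection is returned as-is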
parser.add_argument("target_url", help='URL of the target (e.g. "http://amcat.vu.nl")')
parser.add_argument("source_project", help='Project ID in the source', type=int)
parser.add_argument("source_set", help='Article set ID in the source', type=int)
parser.add_argument("target_project", help='Project ID in the target', type=int)
parser.add_argument("--target-set", "-s", type=int,
                    help='Article set ID in the target (if omitted, a new set will be created)')
parser.add_argument("--batch-size", "-b", help='Batch size for copying', type=int, default=100)
parser.add_argument("--from-page", "-p", help='Start from page (batch)', type=int, default=1)
args = parser.parse_args()

fmt = '[%(asctime)s %(levelname)s %(name)s] %(message)s'
logging.basicConfig(format=fmt, level=logging.INFO)
logging.getLogger("requests").setLevel(logging.WARNING)

src = AmcatAPI(args.source_url)
trg = AmcatAPI(args.target_url)
copy_articles(src, args.source_project, args.source_set,
              trg, args.target_project, args.target_set,
              args.batch_size, args.from_page)
import requests
from lxml import html

# Since we need to parse an English-language date (December 3, 2002),
# set the locale to English. You can skip this step on an English-language OS
import datetime
import locale
locale.setlocale(locale.LC_ALL, "en_US.utf8")

# Import amcatclient
from amcatclient import AmcatAPI

# Connect to AmCAT.
# Note: if you create a .amcatauth file in your home dir, there is no
# need to specify username and password.
conn = AmcatAPI("http://amcat.vu.nl", "<username>", "<password>")

# Create a new articleset to add the articles to.
# You can also just set 'setid' to add to an existing set
PROJECT_ID = 1
aset = conn.create_set(project=PROJECT_ID, name="State of the Union",
                       provenance="Scraped from http://www.presidency.ucsb.edu/sou.php")
setid = aset["id"]

# Get the main page and iterate over all links in a 'doclist'
page = requests.get('http://www.presidency.ucsb.edu/sou.php')
tree = html.fromstring(page.text)
for a in tree.cssselect("td.doclist a"):
    # Skip empty links and the 'jump to menu' link
import argparse
import collections
import re

from amcatclient import AmcatAPI

# Connect to AmCAT
parser = argparse.ArgumentParser()
parser.add_argument('host', help='The AmCAT host to connect to, e.g. http://amcat.vu.nl')
parser.add_argument('project', help='The project to count words in')
parser.add_argument('articleset', help='The article set to count words in')
parser.add_argument('--username', help='Username for AmCAT login')
parser.add_argument('--password', help='Password for AmCAT login')
args = parser.parse_args()
conn = AmcatAPI(args.host, args.username, args.password)

# Iterate over the articles, count all words
counts = collections.Counter()
for a in conn.list_articles(args.project, args.articleset):
    # get words by splitting lowercased text on non-word characters
    text = a['text'].lower()
    words = re.split(r"\W+", text)
    counts.update(words)

# delete all words with <= 3 characters
# (iterate over a copy of the keys, as the counter is modified in the loop)
for word in list(counts.keys()):
    if len(word) <= 3:
        del counts[word]

# print most common words
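# A minimal sketch of the step announced by the comment above ("print most
# common words"); printing the top 10 is an assumption, any number works with
# Counter.most_common():
for word, n in counts.most_common(10):
    print(word, n)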
        date=date, text=text, publisher="DWDD")


def get_article_text(url):
    page = requests.get(url)
    page.raise_for_status()
    tree = html.fromstring(page.text)
    text = "\n\n".join(t.text_content()
                       for t in tree.cssselect(".sc-1fnykkm-0.gxieGH,h1.di2x5p-0"))
    return text


conn = AmcatAPI("http://localhost:8000")
if len(sys.argv) > 1:
    end_cursor = sys.argv[1]
else:
    articles = get_first()
    amcat_articles = list(parse_articles(articles))
    conn.create_articles(1, 101, amcat_articles)
    end_cursor = amcat_articles[-1]['dwdd_id']

for i in count():
    # print(f"{i}: {end_cursor}")
    articles = get_articles(end_cursor)
    amcat_articles = list(parse_articles(articles))
    conn.create_articles(1, 101, amcat_articles)
    # advance the cursor so the next iteration fetches the next batch
    end_cursor = amcat_articles[-1]['dwdd_id']
                    type=int)
parser.add_argument("amcat_set", help="Article Set ID in AmCAT server", type=int)
parser.add_argument("login", help="Login LN")
parser.add_argument("password", help="Password LN")
parser.add_argument("medium", help="Medium to get articles from")
parser.add_argument("from_date", help="Date from which to get articles")
parser.add_argument("to_date", help="Date to which to get articles")
parser.add_argument("query", help="searchstring")
parser.add_argument("--geckodriver",
                    help="Path of geckodriver executable (default=~/geckodriver)")
args = parser.parse_args()

from amcatclient import AmcatAPI
conn = AmcatAPI(args.amcat_host)

driver_path = args.geckodriver or str(Path.home() / "geckodriver")
driver = webdriver.Firefox(executable_path=driver_path)

from_date = datetime.strptime(args.from_date, "%Y-%m-%d")
to_date = datetime.strptime(args.to_date, "%Y-%m-%d")

login_nexis(driver, args.login, args.password)
for page in scrape_nexis(driver, args.medium, from_date, to_date, args.query):
    conn.create_articles(args.amcat_project, args.amcat_set, page)
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.
# ###########################################################################

import argparse

from amcatclient import AmcatAPI

# Connect to AmCAT
parser = argparse.ArgumentParser()
parser.add_argument('host', help='The AmCAT host to connect to, e.g. http://amcat.vu.nl')
parser.add_argument('--username', help='Username for AmCAT login')
parser.add_argument('--password', help='Password for AmCAT login')
args = parser.parse_args()
conn = AmcatAPI(args.host, args.username, args.password)

articles = [{
    "headline": "test headline3",
    "medium": "test medium",
    "text": "test text",
    "date": "2013-01-01"
}, {
    "headline": "test headline4",
    "medium": "test medium",
    "text": "test text",
    "date": "2013-01-01"
}]

aset = conn.create_set(project=1, name="Testset", provenance="test data")
articles = conn.create_articles(project=1,


                    "-t", help="Provide auth token "
                               "(default reads ./.nlpipe_token or NLPIPE_TOKEN)")
args = parser.parse_args()

logging.basicConfig(
    level=logging.DEBUG if args.verbose else logging.INFO,
    format='[%(asctime)s %(name)-12s %(levelname)-5s] %(message)s')
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("amcatclient").setLevel(logging.INFO)

logging.debug("Will {args.action} at nlpipe {args.nlpipeserver} all articles "
              "from {args.amcatserver} set {args.articleset}".format(**locals()))

amcatserver = AmcatAPI(args.amcatserver)
nlpipeserver = get_client(args.nlpipeserver, args.token)

if args.action == "process":
    process(amcatserver, args.project, args.articleset,
            nlpipeserver, args.module, args.reset_error, args.reset_started)
if args.action == "process_pipe":
    process_pipe(amcatserver, args.project, args.articleset,
                 nlpipeserver, args.module, "alpinonerc")
if args.action == "status":
    status = get_status(amcatserver, args.project, args.articleset,
                        nlpipeserver, args.module)
    for k, v in Counter(status.values()).items():
        print("{k}: {v}".format(**locals()))
if args.action == 'result':
    results = get_results(amcatserver,
    for post in posts:
        # this is the same as the "title, =" notation used below
        #titles = post.cssselect("h2")
        #if len(titles) != 1:
        #    raise Exception("Boe")
        #title = titles[0]
        link, = post.cssselect("h2 > a")
        href = link.get("href")
        if not href.startswith("https://www.pvda.nl/nieuws/"):
            continue
        headline = link.text_content().strip()
        meta, = post.cssselect("span.meta")
        datestr = meta.text_content()
        m = re.match(r"(\d+ \w+ \d{4})", datestr.strip())
        if not m:
            raise ValueError(f"Cannot parse date: {datestr}")
        datestr2 = m.group(1)
        date = datetime.strptime(datestr2, "%d %B %Y")
        yield date, headline, href


#a = scrape_pb("/actueel/nieuws/2019/03/04/reactie-minister-blok-op-het-terugroepen-van-de-nederlandse-ambassadeur-uit-iran")
#print(a)
#sys.exit()

from amcatclient import AmcatAPI
conn = AmcatAPI("https://amcat.nl")
for date, headline, href in get_links():
    a = scrape_pb(href, date, headline)
    conn.create_articles(2051, 80339, [a])
password = fbcredentials.password

from_date = datetime.strptime(args.fromdate, "%Y-%m-%d") if args.fromdate else None
to_date = datetime.strptime(args.todate, "%Y-%m-%d") if args.todate else None

logging.info(f"Logging in to facebook as {username}")
scraper = FBPostScraper(username, password)
try:
    posts = scraper.get_posts(args.page, max_scrolls=args.max_scrolls,
                              date_from=from_date, date_to=to_date)
    if args.amcathost:
        conn = AmcatAPI(args.amcathost)
        buffer = []
        for p in posts:
            buffer.append(p)
            if len(buffer) >= 10:
                logging.info(f"Saving {len(buffer)} articles to {args.amcathost} "
                             f"project {args.project} set {args.set}")
                conn.create_articles(project=args.project, articleset=args.set,
                                     json_data=buffer)
                buffer = []
        if buffer:
            conn.create_articles(project=args.project, articleset=args.set,
                                 json_data=buffer)


        "date": date3,
        "medium": "Persberichten",
        "url": url,
    }


def get_links():
    for page in range(1, 2):
        url = URL_TEMPLATE.format(**locals())
        print(url)
        page = requests.get(url)
        tree = html.fromstring(page.text)
        links = list(tree.cssselect('h5.kamervraag-panel-title-col a'))
        for a in links:
            l = a.get("href")
            link = URL_ROOT + l
            print(link)
            #if not link.startswith("/actueel/"):
            #    raise ValueError("Not a persbericht? {link}".format(**locals()))
            yield link


#a = scrape_pb("/actueel/nieuws/2019/03/04/reactie-minister-blok-op-het-terugroepen-van-de-nederlandse-ambassadeur-uit-iran")
#print(a)
#sys.exit()

from amcatclient import AmcatAPI
conn = AmcatAPI("http://localhost:8000")
for link in get_links():
    a = scrape_pb(link)
    conn.create_articles(2, 42, [a])