# See the License for the specific language governing permissions and # limitations under the License. """Get profile picture from Twitter profiles""" import json import requests import urllib import sling import sling.flags as flags import sling.log as log import sling.task.data as data from sling.task.workflow import * flags.define("--twitterdb", help="database for storing Twitter profiles", default="http://localhost:7070/twitter", metavar="DBURL") bad_images = set( ["http://pbs.twimg.com/profile_images/1302121919014207490/KaYYEC8b.jpg"]) # Task for extracting images from Twitter profiles. class TwitterExtract: def run(self, task): # Get parameters. twitterdb = task.input("twitterdb").name # Load knowledge base. log.info("Load knowledge base") kb = sling.Store()
# See the License for the specific language governing permissions and # limitations under the License. """Run SLING command""" import importlib import subprocess import sys import time import sling import sling.flags as flags import sling.log as log import sling.task.workflow as workflow # Command-line flags. flags.define("COMMAND", help="commands(s) to perform", default=[], nargs="*") flags.define("-l", "--list", help="list commands", default=False, action="store_true") flags.define("--spawn", help="run command in background", default=False, action="store_true") flags.define("--version", help="print version information", default=False,
# limitations under the License. """Run SLING processing""" import sling import sling.flags as flags import sling.log as log import sling.task.corpora as corpora import sling.task.download as download import sling.task.wiki as wiki import sling.task.embedding as embedding import sling.task.entity as entity import sling.task.workflow as workflow # Command-line flags. flags.define("--download_wikidata", help="download wikidata dump", default=False, action='store_true') flags.define("--download_wikipedia", help="download wikipedia dump(s)", default=False, action='store_true') flags.define("--import_wikidata", help="convert wikidata to sling format", default=False, action='store_true') flags.define("--import_wikipedia", help="convert wikidata dump(s) to sling format", default=False,
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Numerical gradient checking.""" import sling import sling.myelin as myelin import sling.flags as flags import numpy as np import math flags.define("--fp64", default=False, action='store_true') flags.define("--seed", default=None) flags.define("--test") flags.define("--dump_data", default=False, action='store_true') flags.parse() compiler = myelin.Compiler() if flags.arg.seed: np.random.seed(int(flags.arg.seed)) shape = [16] dtype = myelin.DT_FLOAT nptype = np.float32 if flags.arg.fp64: dtype = "float64" nptype = np.float64
"""Fetch media files and store in media cache database.""" import email.utils import datetime import hashlib import requests import sys import traceback import urllib import sling import sling.flags as flags flags.define("--kb", default="data/e/kb/kb.sling", help="Knowledge base with media references") flags.define("--mediadb", default="http://localhost:7070/media", help="Media database") flags.define("--max_media_size", help="Maximum media file size", default=63*1024*1024, type=int, metavar="SIZE") flags.define("--blacklist", default="local/media-blacklist.txt", help="List of blacklisted media files")
# # http:#www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Workflow builder for Wikidata and Wikipedia processing""" from workflow import * import corpora import sling.flags as flags flags.define("--index", help="index wiki data sets", default=False, action='store_true') flags.define("--only_primary_language", help="only use wikidata labels from primary language", default=False, action='store_true') flags.define("--only_known_languages", help="only use wikidata labels from known languages", default=False, action='store_true') flags.define("--snapshot_kb", help="create snapshot for knowledge base", default=False,
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Numerical gradient checking.""" import sling import sling.myelin as myelin import sling.flags as flags import numpy as np import math flags.define("--fp64", default=False, action='store_true') flags.parse() compiler = myelin.Compiler() shape = [16] dtype = myelin.DT_FLOAT nptype = np.float32 if flags.arg.fp64: dtype = "float64" nptype = np.float64 # Compute number of elements in shape. def elements(shape): n = 1 for d in shape: n *= d
# See the License for the specific language governing permissions and # limitations under the License. """Run SLING processing""" import sling import sling.flags as flags import sling.log as log import sling.task.corpora as corpora import sling.task.download as download import sling.task.wiki as wiki import sling.task.embedding as embedding import sling.task.workflow as workflow # Command-line flags. flags.define("--download_wikidata", help="download wikidata dump", default=False, action='store_true') flags.define("--download_wikipedia", help="download wikipedia dump(s)", default=False, action='store_true') flags.define("--import_wikidata", help="convert wikidata to sling format", default=False, action='store_true') flags.define("--import_wikipedia", help="convert wikidata dump(s) to sling format", default=False,
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Class defining a dashboard of the status of Sling updates to WikiData.""" import pywikibot import sling import sling.flags as flags import glob flags.define("--test", help="use test record file", default=False, action='store_true') precision_map = { sling.MILLENNIUM: pywikibot.WbTime.PRECISION['millenia'], sling.CENTURY: pywikibot.WbTime.PRECISION['century'], sling.DECADE: pywikibot.WbTime.PRECISION['decade'], sling.YEAR: pywikibot.WbTime.PRECISION['year'], sling.MONTH: pywikibot.WbTime.PRECISION['month'], sling.DAY: pywikibot.WbTime.PRECISION['day'] } class WikiMonitor: def __init__(self): self.site = pywikibot.Site("wikidata", "wikidata") self.repo = self.site.data_repository()
break if other_qid != qid and other_qid not in seen: seen.add(other_qid) self._text(other_category.name) self._form_anchor(" (= %s)" % other_qid, other_qid) self._text(" (%0.4f)" % other_parse.score) self._br() self._end("td") self._end("tr") self._end("table") if __name__ == "__main__": flags.define("--port", help="port number for the HTTP server", default=8001, type=int, metavar="PORT") flags.define( "--parses", help="Recordio of category parses", default="local/data/e/wikicat/parses-with-match-statistics.rec", type=str, metavar="FILE") flags.parse() log.info('Reading parses from %s' % flags.arg.parses) browser_globals.read(flags.arg.parses) server_address = ('', flags.arg.port) httpd = HTTPServer(server_address, Browser) log.info('Starting HTTP Server on port %d' % flags.arg.port) httpd.serve_forever()
break if other_qid != qid and other_qid not in seen: seen.add(other_qid) self._text(other_category.name) self._form_anchor(" (= %s)" % other_qid, other_qid) self._text(" (%0.4f)" % other_parse.score) self._br() self._end("td") self._end("tr") self._end("table") if __name__ == "__main__": flags.define("--port", help="port number for the HTTP server", default=8001, type=int, metavar="PORT") flags.define("--parses", help="Recordio of category parses", default="local/data/e/wikicat/parses-with-match-statistics.rec", type=str, metavar="FILE") flags.define("--output", help="Output dir where Wikibot recordios will be generated.", default="local/data/e/wikicat/", type=str, metavar="DIR") flags.parse() log.info('Reading parses from %s' % flags.arg.parses) browser_globals.init(flags.arg.parses, flags.arg.output)
def attach_fact_matches(self, input_parses): with self.wf.namespace("attach-fact-matches"): matcher = self.wf.task("category-parse-fact-matcher") self.kb_input(matcher) matcher.attach_input("parses", input_parses) output = self.wf.resource( "parses-with-match-statistics.rec", \ dir=self.outdir, format="records/frame") matcher.attach_output("output", output) return output if __name__ == '__main__': flags.define("--port", help="port number for task monitor (0 means no monitor)", default=6767, type=int, metavar="PORT") flags.define("--output", help="Output directory", default="data/e/wikicat", type=str, metavar="DIR") flags.define("--lang", help="Language to process", default="en", type=str, metavar="LANG") flags.define("--min_members", help="Reject categories with less than these many members", default=5,
""" Fetch the Danish Company Registry (CVR) and store the records in a database. """ import sys import requests import json import sling import sling.flags as flags flags.define("--apikey", help="CVR API key file", default="local/keys/cvr.txt", metavar="FILE") flags.define("--start", help="Start time for fetching CVR updates", default=None, metavar="YYYY-MM-DD") flags.define("--end", help="End time for fetching CVR updates", default=None, metavar="YYYY-MM-DD") flags.define("--cvrdb", help="database for storing CVR records", default="http://localhost:7070/cvr", metavar="DBURL") flags.parse()
# http:#www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Class for updating wikidata with extracted facts from a record file.""" import pywikibot import sling import json import sys import datetime import sling.flags as flags flags.define("--first", help="first record to update", default=0, type=int) flags.define("--last", help="last record to update", default=sys.maxsize, type=int) flags.define("--test", help="use test record file", default=False, action='store_true') flags.define("--batch", help="number of records to update", default=3, type=int)
["/trace/_str"]) # There should be the same number of actions in the step. checker.check_eq(len(base_actions), len(expt_actions), \ "Step %d: # of actions" % i) # There should be the same number of steps. checker.check_eq(len(base_steps), len(expt_steps), "# of Steps") base_reader.close() expt_reader.close() flags.define('--base', help='Base recordio', default="", type=str, metavar='FILE') flags.define('--expt', help='Expt recordio', default="", type=str, metavar='FILE') flags.define('--commons', help='Commons', default="", type=str, metavar='FILE') flags.define('--diff', help='File where sample diff (if any) will be written', default="/tmp/diff.txt",
# # http:#www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Corpus locations""" import os import sling.flags as flags # Command-line flags. flags.define("--language", help="primary language for resources", default="en", metavar="LANG") flags.define("--languages", help="list of languages to process", metavar="LANG,...") flags.define("--wikidata", help="wikidata version", default="latest", metavar="YYYYMMDD") flags.define("--wikipedia", help="wikipedia version", default="latest", metavar="YYYYMMDD")
# Prints evaluation metrics. def print_metrics(header, metrics): print "\n", header, "metrics" print "-" * (len(header) + len("metrics") + 1) for metric in ['SPAN', 'FRAME', 'TYPE', 'ROLE', 'SLOT']: for name in ['Precision', 'Recall', 'F1']: key = metric + "_" + name print " %s: %f" % (key, metrics[key]) print if __name__ == '__main__': flags.define('--flow', help='Flow file', default='', type=str, metavar='FLOW') flags.define('--strip', help='Output flow file which drops "dev" blobs', default='', type=str, metavar='FLOW') flags.define('--training_details', help='Print training details or not', default=False, action='store_true') flags.define('--output_commons', help='Output file to store commons', default='', type=str,
# # http:#www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Datasets shared across workflows.""" import sling.flags as flags import sling.task.corpora as corpora from sling.task import * flags.define("--extra_items", help="additional items with info", default=None, metavar="RECFILES") class Datasets: def __init__(self, wf): self.wf = wf #--------------------------------------------------------------------------- # Repository #--------------------------------------------------------------------------- def language_defs(self): """Resource for language definitions. This defines the /lang/<lang> symbols and has meta information for each language.""" return self.wf.resource("languages.sling",
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Compare Myelin flow computations with NumPy.""" import sling import sling.flags as flags import sling.myelin as myelin import sling.myelin.simulator as simulator import numpy as np import sys import struct flags.define("--dt", default=myelin.DT_FLOAT) flags.define("--test") flags.define("--thorough", default=False, action='store_true') flags.define("--repeat", default=1, type=int) flags.define("--skipdiff", default=False, action='store_true') flags.parse() dt = flags.arg.dt print("Myelin test suite for", dt, flags.arg.cpu) print() # Statistics for test runs. class Test: def __init__(self, f):
gold = document.gold for index, cascade in enumerate(cascades): cascade_gold_sequence = cascade.translate(gold) delegate = 0 cost = 0 for cascade_gold in cascade_gold_sequence: cost += cascade.delegates[delegate].size() counts[index][delegate] += 1 if cascade_gold.is_cascade(): delegate = cascade_gold.delegate else: delegate = 0 costs[index] += cost for c, cost, cascade in zip(counts, costs, cascades): print "\n", cascade.__class__.__name__, "cost =", cost, "\n", \ "Delegate invocations:", c, "\n", cascade if __name__ == '__main__': import sling.flags as flags flags.define('--commons', help='Commons store', default='', type=str) flags.define('--input', help='Input corpora', default='', type=str) flags.parse() print_cost_estimates(flags.arg.commons, flags.arg.input)
# # http:#www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Workflow for Wikidata and Wikipedia processing""" import sling.flags as flags from sling.task import * import sling.task.corpora as corpora flags.define("--index", help="index wiki data sets", default=False, action='store_true') flags.define("--only_primary_language", help="only use wikidata labels from primary language", default=False, action='store_true') flags.define("--only_known_languages", help="only use wikidata labels from known languages", default=False, action='store_true') flags.define("--skip_wikipedia_mapping", help="skip wikipedia mapping step", default=False,
import http.cookiejar import json import os import sys import re import requests import time import traceback import urllib.parse from threading import Thread from queue import Queue import sling import sling.flags as flags flags.define("--crawldb", help="database for crawled news articles", default="http://localhost:7070/crawl", metavar="URL") flags.define("--newssites", default="data/crawl/newssites.txt", help="list of news sites") flags.define("--cookiedir", default="local/cookies", help="directory for site-specific cookies") flags.define("--threads", help="number of thread for crawler worker pool", default=10, type=int, metavar="NUM")
# Returns true if 'filename' appears in the list of ids in 'allowed_ids'. def file_allowed(allowed_ids, filename): if len(allowed_ids) == 0: return True _, sep, suffix = filename.partition('data/english/annotations') filename = sep + suffix return filename in allowed_ids if __name__ == "__main__": import os import sys flags.define('--input', help='CONLL folder name ending in "annotations"', default='', type=str) flags.define('--output', help='Output recordio file', default='/tmp/output.rec', type=str) flags.define('--max', help='Maximum number of files to process (-1 for all)', default=-1, type=int) flags.define('--summary', help='Output file where the summary will be written.', default='', type=str) flags.define( '--constituency_schema',
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Fetch profile information from Twitter""" import json import requests import sys import time import tweepy import urllib import sling import sling.flags as flags flags.define("--apikeys", default="local/keys/twitter.json", help="Twitter API key file") flags.define("--twitterdb", help="database for storing Twitter profiles", default="http://localhost:7070/twitter", metavar="DBURL") flags.define("--mediadb", help="database for storing Twitter profiles pictures", default=None, metavar="DBURL") flags.define("--update", help="refresh all updated profiles", default=False,
# limitations under the License. """Monitor Wikimedia change stream and update Wikidata database.""" import json import re import sys import requests import time from threading import Thread from queue import Queue import sling import sling.flags as flags from sling.crawl.sse import SSEStream flags.define("--wiki_changes_stream", help="stream for monitoring updates to wikidata", default="https://stream.wikimedia.org/v2/stream/recentchange", metavar="URL") flags.define("--since", help="retrieve event starting from a specific time", default=None, metavar="YYYY-MM-DDThh:mm:ssZ") flags.define("--wiki_fetch_url", help="url for fetching items from wikidata", default="https://www.wikidata.org/wiki/Special:EntityData", metavar="URL") flags.define("--dburl", help="wiki database url for collecting changes", default="http://localhost:7070/wikidata",
import praw import json import traceback import sys import time import sling.crawl.news as news import sling.flags as flags flags.define("--apikeys", default="local/keys/reddit.json", help="Reddit API key file") flags.parse() # Consider all submission to these subreddits as news articles. news_reddits = [ "AutoNewspaper", "nofeenews", "newsdk", "news", "Full_news", "qualitynews", "worldnews", "worldevents", ] # Ignored subreddits. ignored_reddits = [ "u_toronto_news", "newsokur", ]
Fetch the Companies House company registry using the streaming API. """ import json import requests import sling import sling.flags as flags import sling.crawl.chs as chs import sys import time import traceback from threading import Thread from queue import Queue flags.define("--chskeys", help="Companies House API key file", default="local/keys/chs.txt", metavar="FILE") flags.define("--chsdb", help="database for storing Companies House records", default="http://localhost:7070/chs", metavar="DBURL") flags.define("--checkpoint", help="File with latest checkpoint", default=None, metavar="FILE") flags.define("--checkpoint_interval", help="How often checkpoint is written to disk (seconds)", default=60,
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Workflow for silver-labeling of Wikipedia articles""" import os import sling.flags as flags import sling.task.corpora as corpora import sling.task.data as data from sling.task import * flags.define("--silver_corpus_size", help="maximum number of documents in silver corpus", default=None, type=int, metavar="NUM") flags.define("--decoder", help="parser decoder type", default="knolex") flags.define("--simple_types", help="use simple commons store with basic types", default=False, action="store_true") flags.define("--subwords", help="use subword tokenization", default=False,
# See the License for the specific language governing permissions and # limitations under the License. """Workflows for downloading wiki dumps and datasets""" import os from urllib.request import urlopen import time import sling import sling.task.corpora as corpora import sling.flags as flags import sling.log as log from sling.task.workflow import * flags.define("--dataurl", help="data set site", default="https://ringgaard.com/data", metavar="URL") flags.define("--dataset", help="list of datasets to fetch", default="", metavar="LIST") # Number of concurrent downloads. download_concurrency = 0 # Task for downloading files. class UrlDownload: def run(self, task): # Get task parameters.
import datetime import requests import sys import collections import xml.etree.ElementTree as ET import sling.flags as flags import sling.crawl.dnscache import sling.crawl.news as news flags.define("--daily", default=False, action="store_true", help="fetch daily news feed from newslookup.com") flags.define("--hourly", default=False, action="store_true", help="fetch hourly news feed from newslookup.com") flags.define("--newsites", default=False, action="store_true", help="output new unknown news sites") flags.define("--file", default=None, help="fetch news articles from newslookup file") flags.define("--backupdir", default=None, help="backup directory for newslookup files")