def test_replacement(self):
    s = [0] * 50 + [1] * 50
    c1 = np.array(s).reshape((100, 1))
    s = [0] * 5 + [1] * 5 + [2] * 90
    c2 = np.array(s).reshape((100, 1))
    x = np.hstack([c1, c2])

    domain = data.Domain(
        [data.ContinuousVariable("a"),
         data.DiscreteVariable("b", values=("A", "B", "C"))],
        data.ContinuousVariable("c"),
    )
    table = data.Table.from_numpy(domain, x, c1)
    # Column 0 is continuous (mean 0.5); column 1 is discrete (mode index 2).
    for col, computed_value in ((0, 0.5), (1, 2)):
        var1 = preprocess.Average()(table, col)
        self.assertIsInstance(var1.compute_value, preprocess.ReplaceUnknowns)
        self.assertEqual(var1.compute_value.value, computed_value)
def test_replacement(self):
    nan = np.nan
    X = [[1.0, nan, 0.0],
         [2.0, 1.0, 3.0],
         [nan, nan, nan]]
    domain = data.Domain(
        (data.DiscreteVariable("A", values=["0", "1", "2"]),
         data.ContinuousVariable("B"),
         data.ContinuousVariable("C"))
    )
    table = data.Table.from_numpy(domain, np.array(X))

    v1 = impute.AsValue()(table, domain[0])
    self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
    self.assertTrue(np.all(v1.compute_value(table) == [1., 2., 3.]))
    self.assertEqual([v1.str_val(v) for v in v1.compute_value(table)],
                     ["1", "2", "N/A"])

    v1, v2 = impute.AsValue()(table, domain[1])
    self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
    self.assertTrue(np.all(np.isfinite(v2.compute_value(table))))
    self.assertTrue(np.all(v2.compute_value(table) == [0., 1., 0.]))
    self.assertEqual([v2.str_val(v) for v in v2.compute_value(table)],
                     ["undef", "def", "undef"])

    # Impute every variable; AsValue returns a pair for continuous
    # variables, so flatten the results into a single list.
    vars = reduce(lambda acc, v:
                  acc + (list(v) if isinstance(v, (tuple, list)) else [v]),
                  [impute.AsValue()(table, var) for var in table.domain],
                  [])
    domain = data.Domain(vars)
    idata = table.from_table(domain, table)
    np.testing.assert_allclose(
        idata.X,
        [[1, 1.0, 0, 0.0, 1],
         [2, 1.0, 1, 3.0, 1],
         [3, 1.0, 0, 1.5, 0]]
    )
def test_replacement(self):
    nan = np.nan
    X = [[1.0, nan, 0.0],
         [2.0, 1.0, 3.0],
         [nan, nan, nan]]
    unknowns = np.isnan(X)
    domain = data.Domain(
        (data.DiscreteVariable("A", values=("0", "1", "2")),
         data.ContinuousVariable("B"),
         data.ContinuousVariable("C")))
    table = data.Table.from_numpy(domain, np.array(X))
    for i in range(3):
        v = impute.Random()(table, domain[i])
        self.assertTrue(np.all(np.isfinite(v.compute_value(table))))

    imputer = preprocess.Impute(method=impute.Random())
    itable = imputer(table)
    self.assertTrue(np.all(np.isfinite(itable.X)))

    # The original data should keep its unknowns
    self.assertTrue(np.all(unknowns == np.isnan(table.X)))
    self.assertTrue(np.all(itable.X[~unknowns] == table.X[~unknowns]))
def test_find_compatible_unordered(self):
    gend = data.DiscreteVariable("gend", values=["F", "M"])
    find_comp = data.DiscreteVariable.find_compatible

    self.assertIs(find_comp("gend"), gend)
    self.assertIs(find_comp("gend", values=["F"]), gend)
    self.assertIs(find_comp("gend", values=["F", "M"]), gend)
    self.assertIs(find_comp("gend", values=["M", "F"]), gend)

    # Incompatible because an ordered variable is requested,
    # while the existing one is unordered
    self.assertIsNone(find_comp("gend", values=["M", "F"], ordered=True))
    self.assertIsNone(find_comp("gend", values=["F", "M"], ordered=True))
    self.assertIsNone(find_comp("gend", values=["F"], ordered=True))
    self.assertIsNone(find_comp("gend", values=["M"], ordered=True))
    self.assertIsNone(find_comp("gend", values=["N"], ordered=True))

    # Incompatible due to empty value intersection
    self.assertIsNone(find_comp("gend", values=["N"]))

    # Compatible; adds the new values (see the sketch below)
    self.assertIs(find_comp("gend", values=["F", "N", "R"]), gend)
    self.assertEqual(gend.values, ["F", "M", "N", "R"])
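The last two assertions exercise a side effect worth noting: a compatible request that carries extra values returns the existing descriptor and extends its value list in place. A minimal sketch of the same behavior, assuming the (historical) Orange API the test above uses:

    var = data.DiscreteVariable("color", values=["red", "green"])
    same = data.DiscreteVariable.find_compatible("color", values=["red", "blue"])
    assert same is var             # the existing descriptor is reused
    assert "blue" in var.values    # and its value list grew in place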
def test_replacement(self):
    from Orange.classification import MajorityLearner, SimpleTreeLearner
    from Orange.regression import MeanLearner
    nan = np.nan
    X = [[1.0, nan, 0.0],
         [2.0, 1.0, 3.0],
         [nan, nan, nan]]
    unknowns = np.isnan(X)
    domain = data.Domain(
        (data.DiscreteVariable("A", values=["0", "1", "2"]),
         data.ContinuousVariable("B"),
         data.ContinuousVariable("C")))
    table = data.Table.from_numpy(domain, np.array(X))

    v = impute.Model(MajorityLearner())(table, domain[0])
    self.assertTrue(np.all(np.isfinite(v.compute_value(table))))
    self.assertTrue(
        np.all(v.compute_value(table) == [1., 2., 1.]) or
        np.all(v.compute_value(table) == [1., 2., 2.]))

    v = impute.Model(MeanLearner())(table, domain[1])
    self.assertTrue(np.all(np.isfinite(v.compute_value(table))))
    self.assertTrue(np.all(v.compute_value(table) == [1., 1., 1.]))

    imputer = preprocess.Impute(impute.Model(SimpleTreeLearner()))
    itable = imputer(table)

    # The original data should keep its unknowns
    self.assertTrue(np.all(np.isnan(table.X) == unknowns))
    self.assertTrue(np.all(itable.X[~unknowns] == table.X[~unknowns]))

    Aimp = itable.domain["A"].compute_value
    self.assertIsInstance(Aimp, impute.ReplaceUnknownsModel)

    col = Aimp(table)
    self.assertEqual(col.shape, (len(table),))
    self.assertTrue(np.all(np.isfinite(col)))

    v = Aimp(table[-1])
    self.assertEqual(v.shape, (1,))
    self.assertTrue(np.all(np.isfinite(v)))
def test_sparse_get_distributions(self): def assert_dist_and_unknowns(computed, goal_dist): nonlocal d goal_dist = np.array(goal_dist) sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim == 2 else goal_dist) n_all = np.sum(d.W) if d.has_weights() else len(d) assert_dist_almost_equal(computed, goal_dist) self.assertEqual(computed.unknowns, n_all - sum_dist) domain = data.Domain([ data.DiscreteVariable("d%i" % i, values=tuple("abc")) for i in range(10) ] + [data.ContinuousVariable("c%i" % i) for i in range(10)]) # pylint: disable=bad-whitespace X = sp.csr_matrix( # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # -------------------------------------------------------------------------------- [[0, 2, 0, 2, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, np.nan, 2, 0], [ 0, 0, 1, 1, np.nan, np.nan, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, np.nan, 0, 0 ], [0, 0, 0, 1, 0, 2, np.nan, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0, 0, 0]]) warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning) X[0, 0] = 0 d = data.Table.from_numpy(domain, X) ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) zeros = [5, 0, 0] assert_dist_and_unknowns(ddist[0], zeros) assert_dist_and_unknowns(ddist[1], [4, 0, 1]) assert_dist_and_unknowns(ddist[2], [3, 1, 1]) assert_dist_and_unknowns(ddist[3], [2, 2, 1]) assert_dist_and_unknowns(ddist[4], [3, 1, 0]) assert_dist_and_unknowns(ddist[5], [2, 1, 1]) assert_dist_and_unknowns(ddist[6], [1, 2, 1]) assert_dist_and_unknowns(ddist[7], zeros) assert_dist_and_unknowns(ddist[8], [4, 0, 1]) assert_dist_and_unknowns(ddist[9], [4, 1, 0]) zeros = [[0], [5]] assert_dist_and_unknowns(ddist[10], zeros) assert_dist_and_unknowns(ddist[11], zeros) assert_dist_and_unknowns(ddist[12], zeros) assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]]) assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]]) assert_dist_and_unknowns(ddist[15], zeros) assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]]) assert_dist_and_unknowns(ddist[17], [[0], [3]]) assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]]) assert_dist_and_unknowns(ddist[19], zeros) with d.unlocked(): d.set_weights(np.array([1, 2, 3, 4, 5])) ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) assert_dist_and_unknowns(ddist[0], [15, 0, 0]) assert_dist_and_unknowns(ddist[1], [14, 0, 1]) assert_dist_and_unknowns(ddist[2], [8, 2, 5]) assert_dist_and_unknowns(ddist[3], [9, 5, 1]) assert_dist_and_unknowns(ddist[4], [12, 1, 0]) assert_dist_and_unknowns(ddist[5], [9, 1, 3]) assert_dist_and_unknowns(ddist[6], [4, 7, 1]) assert_dist_and_unknowns(ddist[7], [15, 0, 0]) assert_dist_and_unknowns(ddist[8], [13, 0, 2]) assert_dist_and_unknowns(ddist[9], [14, 1, 0]) zeros = [[0], [15]] assert_dist_and_unknowns(ddist[10], zeros) assert_dist_and_unknowns(ddist[11], zeros) assert_dist_and_unknowns(ddist[12], zeros) assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]]) assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]]) assert_dist_and_unknowns(ddist[15], zeros) assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]]) assert_dist_and_unknowns(ddist[17], [[0], [12]]) assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]]) assert_dist_and_unknowns(ddist[19], zeros)
        self.assertTrue(math.isnan(var.to_val("?")))
        # TODO: with self.assertRaises(ValueError): var.to_val(2)
        with self.assertRaises(ValueError):
            var.to_val("G")


PickleContinuousVariable = create_pickling_tests(
    "PickleContinuousVariable",
    ("variable", lambda: data.ContinuousVariable()),
    ("with_name", lambda: data.ContinuousVariable(name="Feature 0")),
)

PickleDiscreteVariable = create_pickling_tests(
    "PickleDiscreteVariable",
    ("variable", lambda: data.DiscreteVariable()),
    ("with_name", lambda: data.DiscreteVariable(name="Feature 0")),
    ("with_int_values",
     lambda: data.DiscreteVariable(name="Feature 0", values=[1, 2, 3])),
    ("with_str_value",
     lambda: data.DiscreteVariable(name="Feature 0", values=["F", "M"])),
    ("ordered",
     lambda: data.DiscreteVariable(name="Feature 0", values=["F", "M"],
                                   ordered=True)),
    ("with_base_value",
     lambda: data.DiscreteVariable(name="Feature 0", values=["F", "M"],
                                   base_value=0)),
)

PickleStringVariable = create_pickling_tests(
    "PickleStringVariable",
    ("variable", lambda: data.StringVariable()),
    ("with_name", lambda: data.StringVariable(name="Feature 0")),
)
class WikipediaAPI:
    """ Wraps the Wikipedia API.

    Examples:
        >>> api = WikipediaAPI()
        >>> corpus = api.search('en', ['Barack Obama', 'Hillary Clinton'])
    """
    metas = [
        (data.StringVariable('Title'), lambda doc: getattr(doc, 'title')),
        (data.StringVariable('Content'), lambda doc: getattr(doc, 'content')),
        (data.StringVariable('Summary'), lambda doc: getattr(doc, 'summary')),
        (data.StringVariable('URL'), lambda doc: getattr(doc, 'url')),
        (data.ContinuousVariable('Page ID', number_of_decimals=0),
         lambda doc: int(getattr(doc, 'pageid'))),
        (data.ContinuousVariable('Revision ID', number_of_decimals=0),
         lambda doc: int(getattr(doc, 'revision_id'))),
        (data.DiscreteVariable('Query'), lambda doc: getattr(doc, 'query')),
    ]

    attributes = []
    class_vars = []
    text_features = [m for m, _ in metas]
    string_attributes = [m for m, _ in metas
                         if isinstance(m, data.StringVariable)]

    def __init__(self, on_error=None):
        super().__init__()
        self.on_error = on_error or (lambda x: x)

    def search(self, lang, queries, articles_per_query=10, should_break=None,
               on_progress=None):
        """ Searches for articles.

        Args:
            lang (str): A language code in ISO 639-1 format.
            queries (list of str): A list of queries.
            articles_per_query (int): Number of articles to fetch per query.
            should_break (callable): Callback for breaking the computation
                before the end. If it evaluates to True, downloading is
                stopped and the documents downloaded so far are returned
                in a Corpus.
            on_progress (callable): Callback for the progress bar.
        """
        wikipedia.set_lang(lang)

        results = []
        for i, query in enumerate(queries):
            try:
                articles = wikipedia.search(query, results=articles_per_query)
                for j, article in enumerate(articles):
                    if callable(should_break) and should_break():
                        break
                    results.extend(self._get(article, query, should_break))

                    if callable(on_progress):
                        on_progress((i * articles_per_query + j + 1) /
                                    (len(queries) * articles_per_query),
                                    len(results))
            except (wikipedia.exceptions.HTTPTimeoutError, IOError) as e:
                self.on_error(str(e))
                break

            if callable(should_break) and should_break():
                break

        return Corpus.from_documents(results, 'Wikipedia', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def _get(self, article, query, should_break, recursive=True):
        try:
            article = wikipedia.page(article)
            article.query = query
            return [article]
        except wikipedia.exceptions.DisambiguationError:
            res = []
            if recursive:
                for article in wikipedia.search(article, 10):
                    if callable(should_break) and should_break():
                        break
                    res.extend(self._get(article, query, should_break,
                                         recursive=False))
            return res
        except wikipedia.exceptions.PageError:
            return []
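A minimal usage sketch (not part of the original module; it assumes network access and the third-party `wikipedia` package imported above):

    api = WikipediaAPI(on_error=print)
    corpus = api.search('en', ['Ljubljana'], articles_per_query=3)
    print(len(corpus))  # at most 3 articles per query, minus failed pages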
def _construct_sparse(): domain = data.Domain( [ data.DiscreteVariable("d%i" % i, values=list("abc")) for i in range(10) ] + [data.ContinuousVariable("c%i" % i) for i in range(10)], data.DiscreteVariable("y", values=list("abc")), ) # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # ------------------------------------------------------------ # 2 2 1 1 2 1 1 1 2 0 2 # 1 1 0 0 1 2 2 1 0 # 1 2 0 # # 2 0 1 1.1 # sdata = np.array([ 2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 0, 1, 2, 0, 2, 0, 1, 1.1, ]) indices = [ 1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18, 2, 3, 4, 5, 6, 8, 14, 16, 17, 3, 5, 6, 2, 5, 6, 13, ] indptr = [0, 11, 20, 23, 23, 27] X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20)) Y = np.array([[1, 2, 1, 0, 0]]).T return data.Table.from_numpy(domain, X, Y)
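A hedged sanity check (illustrative, not part of the original helper): the `(sdata, indices, indptr)` triplet should reproduce the grid drawn in the comment above.

    t = _construct_sparse()
    dense = t.X.toarray()
    assert dense[0, 1] == 2 and dense[0, 3] == 2   # row 0 of the grid
    assert dense[3].sum() == 0                     # row 3 is empty (indptr 23..23)
    assert dense[4, 13] == 1.1                     # last value in sdata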
class NYT:
    """ Class for fetching records from the NYT API. """

    @staticmethod
    def keywords(doc, name):
        return ', '.join([kw.get('value')
                          for kw in doc.get('keywords', [])
                          if kw['name'] == name])

    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'),
         lambda doc: doc.get('section_name', None)),
    ]

    tv = data.TimeVariable('Publication Date')

    metas = [
        (data.StringVariable('Headline'),
         lambda doc: doc.get('headline', {}).get('main') or ''),
        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
        (data.StringVariable('Lead Paragraph'),
         lambda doc: doc.get('lead_paragraph') or ''),
        (data.StringVariable('Subject Keywords'),
         lambda doc: NYT.keywords(doc, 'subject')),
        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
        (data.StringVariable('Locations'),
         lambda doc: NYT.keywords(doc, 'glocations')),
        (data.StringVariable('Persons'),
         lambda doc: NYT.keywords(doc, 'persons')),
        (data.StringVariable('Organizations'),
         lambda doc: NYT.keywords(doc, 'organizations')),
        (data.StringVariable('Creative Works'),
         lambda doc: NYT.keywords(doc, 'creative_works')),
        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
        (data.DiscreteVariable('Article Type'),
         lambda doc: doc.get('type_of_material', None)),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
         lambda doc: doc.get('word_count', None)),
    ]

    text_features = [metas[0][0], metas[1][0]]  # headline + abstract

    def __init__(self, api_key):
        """
        Args:
            api_key (str): NY Times API key.
        """
        self.api_key = api_key
        self.on_error = None
        self.on_rate_limit = None
        self.on_no_connection = None

        self.cache_path = None
        self._cache_init()

    def api_key_valid(self):
        """ Checks whether the API key given at initialization is valid. """
        url = self._encode_url('test')
        try:
            with request.urlopen(url) as connection:
                if connection.getcode() == 200:
                    return True
        except (HTTPError, URLError, HTTPException):
            return False

    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximum number of documents returned.
            on_progress (callable): Called after every iteration of downloading.
            should_break (callable): Callback for breaking the computation
                before the end. If it evaluates to True, downloading is
                stopped and the documents downloaded so far are returned
                in a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO create corpus on the fly and extend, so it stops faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None

        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
            if callable(should_break) and should_break():
                break

            if go_sleep:
                sleep(SLEEP)

            data, go_sleep = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def _cache_init(self):
        """ Initialize cache in the Orange environment buffer dir.
""" path = os.path.join(environ.cache_dir(), "nytcache") try: if not os.path.exists(path): os.makedirs(path) self.cache_path = os.path.join(path, "query_cache") except OSError as e: warnings.warn('Could not initialize NYT cache: {}'.format(str(e)), RuntimeWarning) def _cache_fetch(self, url): """ Fetch URL from cache if present. """ with shelve.open(self.cache_path) as cache: if url in cache.keys(): return cache[url] else: return None def _cache_store(self, url, data): """ Store data for URL in cache. """ with shelve.open(self.cache_path) as cache: cache[url] = data def _fetch_page(self, query, date_from, date_to, page): """ Fetch one page either from cache or web. """ cache_url = self._encode_url(query, date_from, date_to, page, for_caching=True) data = self._cache_fetch(cache_url) if data: return data, False else: url = self._encode_url(query, date_from, date_to, page, for_caching=False) try: with request.urlopen(url, timeout=TIMEOUT) as conn: data = conn.read().decode('utf-8') except HTTPError as e: if e.code == 403 and page > 0: # occasionally some pages return error 403 (Forbidden) # while all other page numbers seem to work just fine. # Skip such pages and don't break loading! warnings.warn('NYT api returned HTTPError with code 403 ' '(Forbidden)! Skipping this page ...') return {'response': {'docs': []}}, True if e.code == 429 and callable(self.on_rate_limit): self.on_rate_limit() elif callable(self.on_error): self.on_error(str(e)) return None, False except URLError: if callable(self.on_no_connection): self.on_no_connection() return None, False raise data = json.loads(data) self._cache_store(cache_url, data) return data, True def _encode_url(self, query, date_from=None, date_to=None, page=0, for_caching=False): """ Encode url for given query, date restrictions and page number. Args: query (str): Search query. date_from (date): Date restriction. date_to (date): Date restriction. page (int): Page number. for_caching (bool): Whether URL would be used for caching. If set, exclude BASE_URL and API key. Returns: str: An encoded URL. """ params = [ # list required to preserve order - important for caching ('fq', 'The New York Times'), ('api-key', self.api_key), ('q', query), ('page', page), ] if date_from: params.append(('begin_date', date_from.strftime('%Y%m%d'))) if date_to: params.append(('end_date', date_to.strftime('%Y%m%d'))) if for_caching: # remove api key, return only params del params[0] return parse.urlencode(params) else: return '{}?{}'.format(BASE_URL, parse.urlencode(params))
def test_sparse_get_distributions(self): domain = data.Domain( [data.DiscreteVariable("d%i" % i, values=list("abc")) for i in range(10)] + [data.ContinuousVariable("c%i" % i) for i in range(10)]) # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 # ------------------------------------------------------------ # 2 2 1 1 2 1 1 1 2 0 2 # 1 1 0 0 1 2 2 1 0 # 1 2 0 # # 2 0 1 1.1 # sdata = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 0, 1, 2, 0, 2, 0, 1, 1.1]) indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18, 2, 3, 4, 5, 6, 8, 14, 16, 17, 3, 5, 6, 2, 5, 6, 13] indptr = [0, 11, 20, 23, 23, 27] X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20)) d = data.Table.from_numpy(domain, X) ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) np.testing.assert_almost_equal(ddist[0], [0, 0, 0]) np.testing.assert_almost_equal(ddist[1], [0, 0, 1]) np.testing.assert_almost_equal(ddist[2], [0, 1, 1]) np.testing.assert_almost_equal(ddist[3], [0, 2, 1]) np.testing.assert_almost_equal(ddist[4], [1, 1, 0]) np.testing.assert_almost_equal(ddist[5], [2, 1, 1]) np.testing.assert_almost_equal(ddist[6], [1, 2, 1]) np.testing.assert_almost_equal(ddist[7], [0, 0, 0]) np.testing.assert_almost_equal(ddist[8], [0, 0, 1]) np.testing.assert_almost_equal(ddist[9], [0, 1, 0]) z = np.zeros((2, 0)) np.testing.assert_almost_equal(ddist[10], z) np.testing.assert_almost_equal(ddist[11], z) np.testing.assert_almost_equal(ddist[12], z) np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 1]]) np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 1]]) np.testing.assert_almost_equal(ddist[15], z) np.testing.assert_almost_equal(ddist[16], [[1, 2], [1, 1]]) np.testing.assert_almost_equal(ddist[17], [[0], [2]]) np.testing.assert_almost_equal(ddist[18], [[2], [1]]) np.testing.assert_almost_equal(ddist[19], z) d.set_weights(np.array([1, 2, 3, 4, 5])) ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) np.testing.assert_almost_equal(ddist[0], [0, 0, 0]) np.testing.assert_almost_equal(ddist[1], [0, 0, 1]) np.testing.assert_almost_equal(ddist[2], [0, 2, 5]) np.testing.assert_almost_equal(ddist[3], [0, 5, 1]) np.testing.assert_almost_equal(ddist[4], [2, 1, 0]) np.testing.assert_almost_equal(ddist[5], [7, 1, 3]) np.testing.assert_almost_equal(ddist[6], [3, 7, 1]) np.testing.assert_almost_equal(ddist[7], [0, 0, 0]) np.testing.assert_almost_equal(ddist[8], [0, 0, 2]) np.testing.assert_almost_equal(ddist[9], [0, 1, 0]) z = np.zeros((2, 0)) np.testing.assert_almost_equal(ddist[10], z) np.testing.assert_almost_equal(ddist[11], z) np.testing.assert_almost_equal(ddist[12], z) np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 5]]) np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 2]]) np.testing.assert_almost_equal(ddist[15], z) np.testing.assert_almost_equal(ddist[16], [[1, 2], [2, 1]]) np.testing.assert_almost_equal(ddist[17], [[0], [3]]) np.testing.assert_almost_equal(ddist[18], [[2], [1]]) np.testing.assert_almost_equal(ddist[19], z)
import unittest

from Orange.testing import create_pickling_tests
from Orange import data

age = data.ContinuousVariable(name="AGE")
gender = data.DiscreteVariable(name="Gender", values=["M", "F"])
incomeA = data.ContinuousVariable(name="incomeA")
income = data.ContinuousVariable(name="income")
education = data.DiscreteVariable(name="education", values=["GS", "HS", "C"])
ssn = data.StringVariable(name="SSN")
race = data.DiscreteVariable(name="race",
                             values=["White", "Hispanic", "African", "Other"])

PickleDomain = create_pickling_tests(
    "PickleDomain",
    ("empty_domain", lambda: data.Domain([])),
    ("with_continuous_variable", lambda: data.Domain([age])),
    ("with_discrete_variable", lambda: data.Domain([gender])),
    ("with_mixed_variables", lambda: data.Domain([age, gender])),
    ("with_continuous_class", lambda: data.Domain([age, gender], [incomeA])),
    ("with_discrete_class", lambda: data.Domain([age, gender], [education])),
    ("with_multiple_classes", lambda: data.Domain([age, gender],
                                                  [incomeA, education])),
    ("with_metas", lambda: data.Domain([age, gender], metas=[ssn])),
    ("with_class_and_metas", lambda: data.Domain([age, gender],
                                                 [incomeA, education],
                                                 metas=[ssn])),
)
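`create_pickling_tests` comes from Orange.testing; each entry presumably generates a pickle/unpickle round-trip test for the object built by the lambda. A hedged sketch of the equivalent manual check:

    import pickle
    domain = data.Domain([age, gender], metas=[ssn])
    restored = pickle.loads(pickle.dumps(domain))
    assert [v.name for v in restored.variables] == ["AGE", "Gender"]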
class TheGuardianAPI:
    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
    ]

    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
        (data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
        (data.StringVariable('Trail Text'),
         lambda doc: doc['fields']['trailText']),
        (data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
        (tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
        (data.StringVariable('Tags'),
         lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
        (data.StringVariable('URL'), lambda doc: doc['webUrl']),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
         lambda doc: doc['fields']['wordcount']),
    ]

    text_features = [metas[0][0], metas[1][0]]  # Headline + Content
    title_indices = [-1]  # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian API
                credentials.
            on_progress (callable): Function for progress reporting.
            should_break (callable): Function for early stopping.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        data = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, data)
        parsed = json.loads(response.text)

        if page == 1:  # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        data = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            data['from-date'] = from_date
        if to_date is not None:
            data['to-date'] = to_date

        return data

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): A query to search the articles by.
            from_date (str): Search only articles newer than the date provided.
                The date should be in ISO format, e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                The date should be in ISO format, e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                of multiple consecutive search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []

        self._search(query, from_date, to_date)

        pages = math.ceil(max_documents / self.per_page) if max_documents \
            else self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):  # to one-based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p * self.per_page, pages * self.per_page)

        c = Corpus.from_documents(
            self.results, 'The Guardian', self.attributes, self.class_vars,
            self.metas, title_indices=self.title_indices)
        c.text_features = self.text_features
        return c
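A minimal usage sketch (assumes a `TheGuardianCredentials` instance with a valid key, plus network access):

    api = TheGuardianAPI(credentials)
    corpus = api.search('climate', from_date='2016-01-01', max_documents=20)
    print(len(corpus))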
class TwitterAPI:
    """ Fetch tweets from the Twitter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets
        from previous searches and only return results from the last search,
        either call the `reset` method before searching or provide
        `collecting=False` argument to the search method.
    """
    attributes = []
    class_vars = [
        (data.DiscreteVariable('Author'),
         lambda doc: '@' + doc.author.screen_name),
    ]

    tv = data.TimeVariable('Date')
    metas = [
        (data.StringVariable('Content'), lambda doc: doc.text),
        (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (data.DiscreteVariable('Language'), lambda doc: doc.lang),
        (data.DiscreteVariable('Location'),
         lambda doc: getattr(doc.place, 'country_code', None)),
        (data.ContinuousVariable('Number of Likes', number_of_decimals=0),
         lambda doc: doc.favorite_count),
        (data.ContinuousVariable('Number of Retweets', number_of_decimals=0),
         lambda doc: doc.retweet_count),
        (data.DiscreteVariable('In Reply To'),
         lambda doc: '@' + doc.in_reply_to_screen_name
         if doc.in_reply_to_screen_name else ''),
        (data.DiscreteVariable('Author Name'), lambda doc: doc.author.name),
        (data.StringVariable('Author Description'),
         lambda doc: doc.author.description),
        (data.ContinuousVariable('Author Statuses Count', number_of_decimals=0),
         lambda doc: doc.author.statuses_count),
        (data.ContinuousVariable('Author Favourites Count',
                                 number_of_decimals=0),
         lambda doc: doc.author.favourites_count),
        (data.ContinuousVariable('Author Friends Count', number_of_decimals=0),
         lambda doc: doc.author.friends_count),
        (data.ContinuousVariable('Author Followers Count',
                                 number_of_decimals=0),
         lambda doc: doc.author.followers_count),
        (data.ContinuousVariable('Author Listed Count', number_of_decimals=0),
         lambda doc: doc.author.listed_count),
        (data.DiscreteVariable('Author Verified'),
         lambda doc: str(doc.author.verified)),
        (data.ContinuousVariable('Longitude'),
         lambda doc: coordinates_geoJSON(doc.coordinates)[0]),
        (data.ContinuousVariable('Latitude'),
         lambda doc: coordinates_geoJSON(doc.coordinates)[1]),
    ]

    text_features = [metas[0][0]]  # Content
    string_attributes = [m for m, _ in metas
                         if isinstance(m, data.StringVariable)]

    def __init__(self, credentials, on_progress=None, should_break=None,
                 on_error=None, on_rate_limit=None):
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        self.container = OrderedDict()
        self.search_history = []

        # Callbacks:
        self.on_error = on_error
        self.on_rate_limit = on_rate_limit
        self.on_progress = on_progress or (lambda *args: args)
        self.should_break = should_break or (lambda *args: False)

    @property
    def tweets(self):
        return self.container.values()

    def search_content(self, content, *, max_tweets=0, lang=None,
                       allow_retweets=True, collecting=False):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero, limits the number of
                downloaded tweets.
            lang (str): A language code (in either ISO 639-1 or ISO 639-3
                format).
            allow_retweets (bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.
Returns: Corpus """ if not collecting: self.reset() if max_tweets == 0: max_tweets = float('Inf') def build_query(): nonlocal content if not content: q = 'from: ' else: if not isinstance(content, list): content = [content] q = ' OR '.join(['"{}"'.format(q) for q in content]) if not allow_retweets: q += ' -filter:retweets' return q query = build_query() cursor = tweepy.Cursor(self.api.search, q=query, lang=lang) corpus, count = self.fetch(cursor, max_tweets) self.append_history('Content', content, lang if lang else 'Any', str(allow_retweets), count) return corpus def search_authors(self, authors, *, max_tweets=0, collecting=False): """ Search by authors. Args: authors (list of str): A list of authors to search for. max_tweets (int): If greater than zero limits the number of downloaded tweets. collecting (bool): Whether to collect results across multiple search calls. Returns: Corpus """ if not collecting: self.reset() if max_tweets == 0: # set to max allowed for progress max_tweets = 3200 if not isinstance(authors, list): authors = [authors] cursors = [tweepy.Cursor(self.api.user_timeline, screen_name=a) for a in authors] corpus, count = self.fetch(cursors, max_tweets) self.append_history('Author', authors, None, None, count) return corpus def fetch(self, cursors, max_tweets): if not isinstance(cursors, list): cursors = [cursors] count = 0 try: for i, cursor in enumerate(cursors): for j, tweet in enumerate(cursor.items(max_tweets), start=1): if self.should_break(): break if tweet.id not in self.container: count += 1 self.container[tweet.id] = tweet if j % 20 == 0: self.on_progress(len(self.container), (i*max_tweets + j)/ (len(cursors)*max_tweets)) if self.should_break(): break except tweepy.TweepError as e: if e.response.status_code == 429 and self.on_rate_limit: self.on_rate_limit() elif self.on_error: self.on_error(str(e)) return None, 0 return self.create_corpus(), count def create_corpus(self): return Corpus.from_documents(self.tweets, 'Twitter', self.attributes, self.class_vars, self.metas, title_indices=[-1]) def reset(self): """ Removes all downloaded tweets. """ self.search_history = [] self.container = OrderedDict() def append_history(self, mode, query, lang, allow_retweets, n_tweets): query = ', '.join(query) if isinstance(query, Iterable) else query if lang in code2lang.keys(): lang = code2lang[lang] self.search_history.append(( ('Query', query), ('Search by', mode), ('Language', lang), ('Allow retweets', allow_retweets), ('Tweets count', n_tweets), )) def report(self): return self.search_history
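A minimal usage sketch (assumes tweepy credentials exposing an `auth` attribute, as the constructor above expects, and network access):

    api = TwitterAPI(credentials)
    corpus = api.search_content(['data mining'], max_tweets=100, lang='en')
    corpus = api.search_authors(['OrangeDataMiner'], max_tweets=50,
                                collecting=True)  # aggregates with the above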
class FacebookOrangeAPI:
    attributes = []
    class_vars = []

    image_var = data.StringVariable.make("image")
    image_var.attributes["type"] = "image"

    post_metas = [
        (data.StringVariable('Message'), lambda doc: doc['status_message']),
        (data.DiscreteVariable('From'), lambda doc: doc['from_name']),
        (data.ContinuousVariable('likes'), lambda doc: doc['like']),
        (data.ContinuousVariable('comments'), lambda doc: doc['comments']),
        (data.ContinuousVariable('shares'), lambda doc: doc['shares']),
        (data.DiscreteVariable('top emotion'), lambda doc: doc['top_reaction']),
        (data.StringVariable('Link name'), lambda doc: doc['link_name']),
        (image_var, lambda doc: doc['picture']),
        (data.StringVariable('link'), lambda doc: doc['status_link']),
        (data.DiscreteVariable('From ID'), lambda doc: doc['from_id']),
        (data.StringVariable('Post ID'), lambda doc: doc['status_id']),
        (data.DiscreteVariable('Post type'), lambda doc: doc['status_type']),
        (data.TimeVariable('Publication Date'),
         lambda doc: doc['status_published']),
        (data.TimeVariable('Publication Date UTC'),
         lambda doc: doc['status_published_utc']),
        (data.ContinuousVariable('emotion angry'), lambda doc: doc['angry']),
        (data.ContinuousVariable('emotion love'), lambda doc: doc['love']),
        (data.ContinuousVariable('emotion haha'), lambda doc: doc['haha']),
        (data.ContinuousVariable('emotion wow'), lambda doc: doc['wow']),
        (data.ContinuousVariable('emotion sad'), lambda doc: doc['sad'])
    ]
    text_features = [post_metas[0][0]]
    title_indices = [-1]

    def __init__(self, credentials, on_progress=None, should_break=None):
        self.utc_datecor = datetime.utcnow() - datetime.now()
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

    def buildUrl(self, node, version='v2.11'):
        return BASE_URL + '/' + version + '/' + node

    def getData(self, url, params=None):
        while True:
            if self.should_break():
                return {}
            try:
                headers = {'Authorization': 'Bearer ' + self.credentials.token}
                p = requests.get(url, params=params, headers=headers)
                return p.json()
            except (requests.RequestException, ValueError):
                # The request failed or returned malformed JSON; retry in
                # 5 sec, checking should_break every 0.1 sec.
                print('retry in 5 sec')
                for i in range(50):
                    if self.should_break():
                        return {}
                    time.sleep(0.1)

    def localToUtc(self, date):
        return date + self.utc_datecor

    def utcToLocal(self, date):
        return date - self.utc_datecor

    def processDate(self, created_time):
        return datetime.strptime(created_time, '%Y-%m-%dT%H:%M:%S+0000')

    def processStatus(self, status, engagement=True):
        d = {}
        d['status_id'] = status['id']
        d['from_id'] = status['from']['id'] if 'from' in status.keys() else ''
        d['from_name'] = status['from']['name'] if 'from' in status.keys() else ''
        d['status_message'] = '' if 'message' not in status.keys() \
            else status['message']
        d['status_type'] = status['type']
        d['link_name'] = '' if 'name' not in status.keys() else status['name']
        d['status_published_utc'] = self.processDate(status['created_time'])
        d['status_published'] = self.utcToLocal(d['status_published_utc'])
        d['status_link'] = '' if 'link' not in status.keys() else status['link']
        d['picture'] = status['full_picture'] if 'full_picture' in status.keys() \
            else ''

        topscore = 0
        d['like'] = status['like']['summary']['total_count'] \
            if engagement else ''
        d['comments'] = status['comments']['summary']['total_count'] \
            if engagement else ''
        d['shares'] = status['shares']['count'] if 'shares' in status.keys() \
            else ''
        d['top_reaction'] = ''
        for score in ['love', 'haha', 'wow', 'sad', 'angry']:
            if engagement:
                d[score] = 
status[score]['summary']['total_count'] if int(d[score]) > topscore: topscore = int(d[score]) d['top_reaction'] = score else: d[score] = '' d['top_reaction'] = '' return d def fieldString(self, engagement=True): field_string = 'message,from,link,created_time,type,name,id,full_picture' if engagement: field_string += ',' + 'comments.limit(0).summary(true),shares.limit(0).summary(true)' for r in ['like', 'love', 'haha', 'wow', 'sad', 'angry']: field_string += ',' + 'reactions.type({}).limit(0).summary(true).as({})'.format( r.upper(), r.lower()) return field_string def getStatuses(self, page_id, mode='posts', since=None, until=None, engagement=True, comments=True): node = page_id + '/' + mode + '/' ## mode can be "posts" (posts by page), "feed" (all posts on page) and "tagged" (all public posts in which page is tagged url = self.buildUrl(node) params = {} params['fields'] = self.fieldString(engagement) params['limit'] = 100 if since is not None: params['since'] = ( self.localToUtc(since)).strftime('%Y-%m-%dT%H:%M:%S') if until is not None: params['until'] = ( self.localToUtc(until)).strftime('%Y-%m-%dT%H:%M:%S') while True: statuses = self.getData(url, params=params) if not 'data' in statuses: break proc_statuses = [ self.processStatus(s, engagement) for s in statuses['data'] ] yield proc_statuses if not 'paging' in statuses.keys(): break if not 'next' in statuses['paging'].keys(): break url = statuses['paging']['next'] def _search(self, page_ids, mode, since, until, max_documents, sub_progress=(0, 1)): since = since.strftime('%Y-%m-%d') until = until.strftime('%Y-%m-%d') since = datetime.strptime(since, '%Y-%m-%d') until = datetime.strptime(until + 'T23:59:59', '%Y-%m-%dT%H:%M:%S') total_sec = float((until - since).total_seconds()) n_pages = len(page_ids) progress_pct = 1 / float(n_pages) for page_i in range(0, n_pages): page_id = page_ids[page_i].strip() if page_id == '': return if '/' in page_id: page_id = page_id.split('/')[-1] page_progress = progress_pct * page_i n = 0 for d in self.getStatuses(page_id, mode, since, until): if self.should_break(): return earliest_date = d[-1]['status_published'] sec_to_go = (until - earliest_date).total_seconds() date_progress = ((sec_to_go / total_sec) * progress_pct) progress = math.ceil((page_progress + date_progress) * 100) self.on_progress(progress_scale(progress, sub_progress), 100) for doc in d: n += 1 if max_documents is not None: if n > max_documents: break yield doc if max_documents is not None: if n > max_documents: break self.on_progress(progress_scale(100, sub_progress), 100) def search(self, page_ids, mode='posts', since=datetime.now() - timedelta(10), until=datetime.now(), max_documents=None, sub_progress=(0, 1)): results = [] for doc in self._search(page_ids, mode, since, until, max_documents, sub_progress): doc['status_published'] = doc['status_published'].strftime( '%Y-%m-%dT%H:%M:%S') doc['status_published_utc'] = doc['status_published_utc'].strftime( '%Y-%m-%dT%H:%M:%S') results.append(doc) c = Corpus.from_documents(results, 'Facebook', self.attributes, self.class_vars, self.post_metas, self.title_indices) c.set_text_features(self.text_features) return c def _search_posts(self, post_ids, sub_progress=(0, 1), engagement=True): for i, post_id in enumerate(post_ids): node = post_id url = self.buildUrl(node) params = {} params['fields'] = self.fieldString(engagement) params['limit'] = 100 status = self.getData(url, params=params) status = self.processStatus(status) yield status progress = ((i + 1) / len(post_ids)) * 100 
self.on_progress(progress_scale(progress, sub_progress), 100) self.on_progress(progress_scale(100, sub_progress), 100) def search_posts(self, post_ids, sub_progress=(0, 1)): results = [] for doc in self._search_posts(post_ids, sub_progress): doc['status_published'] = doc['status_published'].strftime( '%Y-%m-%dT%H:%M:%S') doc['status_published_utc'] = doc['status_published_utc'].strftime( '%Y-%m-%dT%H:%M:%S') results.append(doc) c = Corpus.from_documents(results, 'Facebook', self.attributes, self.class_vars, self.post_metas, self.title_indices) c.set_text_features(self.text_features) return c def processComment(self, comment): has_comment_replies = 'comments' in comment.keys() parent = { 'type': 'comment', 'comment_id': comment['id'], 'likes': comment['like']['summary']['total_count'], 'comment_replies': None, 'message': comment['message'], 'parent_comment_id': '' } parent['status_published_utc'] = self.processDate( comment['created_time']) parent['status_published'] = self.utcToLocal( parent['status_published_utc']) if has_comment_replies: parent['comment_replies'] = comment['comments']['summary'][ 'total_count'] yield parent if has_comment_replies: comment_replies = comment['comments'] while True: for cr in comment_replies['data']: child = { 'type': 'comment_reply', 'comment_id': comment['id'], 'likes': cr['like']['summary']['total_count'], 'message': cr['message'], 'parent_comment_id': cr['id'], 'comment_replies': None } child['status_published_utc'] = self.processDate( cr['created_time']) child['status_published'] = self.utcToLocal( child['status_published_utc']) yield child if not 'paging' in comment_replies.keys(): break if not 'next' in comment_replies['paging'].keys(): break url = comment_replies['paging']['next'] comment_replies = self.getData(url) def _getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)): for i, post_id in enumerate(post_ids): node = post_id + '/comments' url = self.buildUrl(node) params = {} params[ 'fields'] = 'message,created_time,reactions.type(LIKE).summary(true).as(like)' if comment_replies: params[ 'fields'] += ',comments.summary(true){message,created_time,reactions.type(LIKE).summary(true).as(like)}' params['limit'] = 100 while True: comments = self.getData(url, params=params) if len(comments['data']) == 0: break for comment in comments['data']: for proc_comment in self.processComment(comment): proc_comment['post_id'] = post_id yield proc_comment if not 'paging' in comments.keys(): break if not 'next' in comments['paging'].keys(): break url = comments['paging']['next'] progress = ((i + 1) / len(post_ids)) * 100 self.on_progress(progress_scale(progress, sub_progress), 100) self.on_progress(progress_scale(100, sub_progress), 100) def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)): attributes = [] class_vars = [] metas = [(data.StringVariable('Message'), lambda doc: doc['message']), (data.DiscreteVariable('Type'), lambda doc: doc['type']), (data.StringVariable('Post ID'), lambda doc: doc['post_id']), (data.StringVariable('Comment ID'), lambda doc: doc['comment_id']), (data.StringVariable('Parent comment ID'), lambda doc: doc['parent_comment_id']), (data.ContinuousVariable('likes'), lambda doc: doc['likes']), (data.ContinuousVariable('comment replies'), lambda doc: doc['comment_replies']), (data.TimeVariable('Publication Date'), lambda doc: doc['status_published']), (data.TimeVariable('Publication Date UTC'), lambda doc: doc['status_published_utc'])] text_features = [metas[0][0]] title_indices = [-1] results = [] for 
doc in self._getComments(post_ids, comment_replies, sub_progress): doc['status_published'] = doc['status_published'].strftime( '%Y-%m-%dT%H:%M:%S') doc['status_published_utc'] = doc['status_published_utc'].strftime( '%Y-%m-%dT%H:%M:%S') results.append(doc) c = Corpus.from_documents(results, 'Facebook comments', attributes, class_vars, metas, title_indices) c.set_text_features(text_features) return c
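A minimal usage sketch (assumes credentials exposing a valid `token`, as `getData` above expects; the page name and post id are hypothetical):

    api = FacebookOrangeAPI(credentials)
    posts = api.search(['nytimes'], mode='posts', max_documents=50)
    comments = api.getComments(['12345_67890'])  # hypothetical post id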