def _content_items():
    content_query = """
    SELECT '{}' AS source_id, id FROM content
    WHERE (url in ({}) or id in ({}))
    AND org_id = {}
    """
    queries = []
    for source_id, vals in meta.iteritems():
        links = ",".join(
            ["'%s'" % l for l in uniq(meta[source_id].pop('links', []))])
        ids = ",".join(
            [str(i) for i in uniq(meta[source_id].pop('content_item_ids', []))])
        if len(links) or len(ids):
            # THIS IS KIND OF A HACK FOR NOW.
            if not links:
                links = "'__null___'"
            if not ids:
                ids = '-99999'
            queries.append(
                content_query.format(source_id, links, ids, org_id))

    # execute query + modify meta.
    if len(queries):
        q = " UNION ALL ".join(queries)
        for row in ResultIter(db.session.execute(q)):
            src_id = row['source_id']
            k = 'content_item_ids'
            if k not in meta[src_id]:
                meta[src_id][k] = []
            meta[src_id][k].append(row['id'])

    db.session.commit()
    db.session.close()
    db.session.remove()

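# Sketch of one rendered sub-query (hypothetical source_id, url, and org_id
# values, shown only to illustrate the batching approach above); the
# "'__null___'" / -99999 placeholders keep the IN (...) clauses syntactically
# valid when one of the two lists is empty:
#
#     SELECT 'src-1' AS source_id, id FROM content
#     WHERE (url in ('http://example.com/story') or id in (-99999))
#     AND org_id = 1
#
# One such sub-query is built per source; they are all stitched together with
# UNION ALL and executed in a single round trip.
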
def queries(self):
    """
    Programmatically generate search queries based on an org's domains
    """
    domains = self.org.get('domains', [])
    domains.extend(self.settings.get('short_urls', []))
    domains.extend(self.settings.get('short_domains', []))
    domains = uniq(domains)
    _queries = []
    for d in domains:
        term = d.replace(".", " ").strip().lower()
        q = '"{}" filter:links'.format(term)
        _queries.append(q)
    if not len(_queries):
        raise RequestError('This Org has no domains.')
    return uniq(_queries)

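# Illustrative sketch (hypothetical org data, not taken from this module):
# an org with domains ['example.com'] and a short domain ['exm.pl'] would
# produce queries like:
#
#     '"example com" filter:links'
#     '"exm pl" filter:links'
#
# i.e. dots become spaces, terms are lower-cased, and the `filter:links`
# operator restricts Twitter search results to tweets that contain links.
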
def from_html(htmlstring, source=None):
    """
    Extract all img urls from an html string
    """
    if not htmlstring:
        return []
    soup = BeautifulSoup(htmlstring)
    out_imgs = []
    for tag, attr in IMG_TAGS:
        for el in soup.find_all(tag):
            img_url = el.attrs.get(attr)
            if not img_url:
                continue

            # only take images with known formats
            fmt = url.is_image(img_url)
            if not fmt:
                continue

            # absolutify images if we know their source.
            if img_url.startswith('/') or not img_url.startswith('http'):
                if source:
                    img_url = urljoin(source, img_url)
                else:
                    continue
            out_imgs.append(img_url)
    return uniq(out_imgs)

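# Usage sketch (hypothetical markup; assumes `url.is_image` recognizes the
# .jpg extension and that IMG_TAGS includes ('img', 'src')):
#
#     html = '<div><img src="/assets/photo.jpg"></div>'
#     from_html(html, source='http://example.com/story')
#     # -> ['http://example.com/assets/photo.jpg']
#
# Without a `source`, relative image urls are skipped rather than guessed.
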
def meta(self):
    d = {
        'followers': self._user.get('followers_count'),
        'friends': self._user.get('friends_count'),
        'hashtags': uniq([h['text'] for h in self._entities.get('hashtags', [])])
    }
    if self.incl_embed:
        d['embed'] = self.embed
    return d

def get_authors(self, entry):
    """
    return all candidates, and parse unique
    """
    authors = []
    candidates = self.get_candidates(entry, AUTHOR_CANDIDATE_JSONPATH)
    for c in candidates:
        for a in author.parse(c):
            authors.append(a)
    return uniq(authors)

def get_candidates(self, obj, jsonpaths):
    """
    evaluate an object with jsonpaths, and get all unique vals / lists of values
    """
    candidates = []
    for path in jsonpaths:
        path_candidates = self.get_jsonpath(obj, path)
        if isinstance(path_candidates, list):
            for candidate in path_candidates:
                if candidate:
                    candidates.append(candidate)
        elif isinstance(path_candidates, str):
            # a single string match is itself a candidate.
            candidates.append(path_candidates)
    return uniq(candidates)

def links(self):
    """
    Extract all links
    """
    urls = []
    if self.post.get('link'):
        urls.append(self.post['link'])
    if self.post.get('source'):
        urls.append(self.post['source'])
    if self.post.get('message'):
        msg_urls = url.from_string(self.post['message'])
        urls.extend(msg_urls)
    if self.post.get('description'):
        desc_urls = url.from_string(self.post['description'])
        urls.extend(desc_urls)
    return uniq(urls)

def _tags():
    tag_query = """
    SELECT '{0}' AS uniqkey, id FROM tags
    WHERE (slug in ({1}) or id in ({2}))
    AND org_id = {3} AND type='subject'
    """
    queries = []
    for uniqkey, vals in meta.iteritems():

        # separate slugs and ids.
        tags = uniq(meta[uniqkey].pop('tag_ids', []))
        ids = []
        slugs = []
        for t in tags:
            try:
                ids.append(int(t))
            except ValueError:
                slugs.append(t)

        # format queries.
        slugs = ",".join(["'%s'" % s for s in slugs])
        ids = ",".join([str(i) for i in ids])
        if len(slugs) or len(ids):
            if not slugs:
                slugs = "'__null___'"
            if not ids:
                ids = '-99999'
            queries.append(tag_query.format(uniqkey, slugs, ids, org_id))

    # execute query + modify meta.
    if len(queries):
        q = "\nUNION ALL\n".join(queries)
        for row in ResultIter(db.session.execute(q)):
            id = row['uniqkey']
            k = 'tag_ids'
            if k not in meta[id]:
                meta[id][k] = []
            meta[id][k].append(row['id'])

    db.session.commit()
    db.session.close()
    db.session.remove()

def from_string(string, **kw):
    """
    get urls from input string
    """
    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)
    if not string:
        return []
    raw_urls = re_url.findall(string)
    short_urls = [g[0].strip() for g in re_short_url_text.findall(string)]
    urls = []
    if source:
        for url in raw_urls:
            if not is_abs(url):
                url = urljoin(source, url)
            urls.append(url)
    else:
        urls = [u for u in raw_urls if is_valid(u)]

    # make sure short url regex doesn't create partial dupes.
    # (filter rather than removing while iterating, which would skip items.)
    short_urls = [u for u in short_urls if not any(u in r for r in urls)]

    # combine
    urls += short_urls

    # remove images.
    if exclude_images:
        urls = [u for u in urls if not is_image(u)]

    # remove invalid urls
    urls = [u for u in urls if is_valid(u)]
    return uniq(urls)

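# Usage sketch (hypothetical text; exact matches depend on the `re_url` and
# `re_short_url_text` patterns defined elsewhere in this module):
#
#     text = "Read http://example.com/story.html and http://example.com/pic.png"
#     from_string(text)
#     # -> ['http://example.com/story.html']  (the .png is dropped by default)
#     from_string(text, excl_img=False)
#     # -> both urls are kept
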
def from_html(htmlstring, **kw):
    """
    Extract urls from htmlstring, optionally reconciling
    relative urls + embeds + redirects.
    """
    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)
    if not htmlstring:
        return []
    final_urls = []
    if source:
        source_domain = get_domain(source)
    soup = BeautifulSoup(htmlstring)
    for tag in URL_TAGS:
        for el in soup.find_all(tag):
            for attr in URL_ATTRS:
                href = el.attrs.get(attr, None)
                if not href:
                    continue
                url = reconcile_embed(href)
                if source:
                    url = redirect_back(url, source_domain)
                    if not is_abs(url):
                        url = urljoin(source, url)
                if not is_valid(url):
                    continue
                if exclude_images:
                    if not is_image(url):
                        final_urls.append(url)
                else:
                    final_urls.append(url)
    return uniq(final_urls)

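# Usage sketch (hypothetical markup; assumes URL_TAGS / URL_ATTRS cover
# <a href="...">, as the loops above imply):
#
#     html = '<a href="/2015/06/story.html">a story</a>'
#     from_html(html, source='http://example.com/')
#     # -> ['http://example.com/2015/06/story.html']
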
def _authors():
    author_query = """
    SELECT '{0}' AS uniqkey, id FROM authors
    WHERE (name in ({1}) or id in ({2}))
    AND org_id = {3}
    """
    queries = []
    for uniqkey, vals in meta.iteritems():

        # separate names and ids.
        authors = meta[uniqkey].get('author_ids', [])
        ids = []
        names = []
        for a in authors:
            try:
                ids.append(int(a))
            except ValueError:
                names.append(a.upper().strip())

        names = ",".join(["'%s'" % n for n in uniq(names)])
        ids = ",".join([str(i) for i in uniq(ids)])
        if names or ids:
            if not names:
                names = "'__null___'"
            if not ids:
                ids = '-99999'
            queries.append(author_query.format(uniqkey, names, ids, org_id))

    # execute query + modify meta.
    if len(queries):
        q = "\nUNION ALL\n".join(queries)
        for row in ResultIter(db.session.execute(q)):
            id = row['uniqkey']
            k = 'author_ids'
            # replace the raw name/id list with matched database ids the first
            # time we see this key, then accumulate subsequent matches.
            if not meta[id].get('authors_exist'):
                meta[id][k] = []
            meta[id][k].append(row['id'])
            meta[id]['authors_exist'] = True

    # check for authors we should create.
    to_create = []
    for uniqkey, item in meta.iteritems():
        if item.get('authors_exist', False):
            continue
        for a in meta[uniqkey].pop('author_ids', []):
            if not isinstance(a, (basestring, str, unicode)):
                continue
            to_create.append((uniqkey, org_id, a))

    # if we should create them, do so.
    if len(to_create):

        # create authors + keep track of content relations
        authors_to_ids = dict()
        seen = set()
        for uniqkey, oid, name in to_create:
            name = name.upper().strip()
            if name not in seen and name.lower().strip() not in author.BAD_TOKENS:
                authors_to_ids[name] = {}
                seen.add(name)
                a = Author(org_id=oid, name=name)
                db.session.add(a)
                authors_to_ids[name]['obj'] = a

            # keep track of ALL ids associated with this author.
            if name in authors_to_ids:
                if 'ids' not in authors_to_ids[name]:
                    authors_to_ids[name]['ids'] = []
                authors_to_ids[name]['ids'].append(uniqkey)

        # create new authors so we can access their IDs.
        db.session.commit()

        # set author ids back on content item meta
        for name, values in authors_to_ids.iteritems():
            ids = values.get('ids', [])
            obj = values.get('obj')
            k = 'author_ids'
            for uniqkey in ids:
                if k not in meta[uniqkey]:
                    meta[uniqkey][k] = []
                meta[uniqkey][k].append(obj.id)

    db.session.close()
    db.session.remove()

def tokenizer(text, n):
    """
    Tokenize unique ngrams.
    """
    grams = ngrams(text, n)
    return uniq([" ".join(gram).decode('utf-8') for gram in grams])

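# Usage sketch (assumes `ngrams` yields tuples of word-level, byte-string
# tokens, which is why each gram is joined and then utf-8 decoded):
#
#     tokenizer("the quick brown fox", 2)
#     # -> [u'the quick', u'quick brown', u'brown fox']
#     # (order may differ if uniq() does not preserve input order)
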
def required_metrics(f):
    """
    What metrics does this formula require?
    """
    return uniq(re_formula_metric_names.findall(f))

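# Usage sketch (the metric-name syntax here is hypothetical; the real syntax
# is whatever `re_formula_metric_names` matches):
#
#     required_metrics("{pageviews} / {twitter_shares}")
#     # -> ['pageviews', 'twitter_shares']
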
def get_tags(self, entry):
    """
    Get all tags.
    """
    tags = self.get_candidates(entry, TAG_CANDIDATE_JSONPATH)
    return uniq([t.upper() for t in tags if t and t.strip() != ""])

def links(self):
    return uniq([u['expanded_url'] for u in self._entities.get('urls', [])])

def img_url(self):
    media = uniq([h['media_url'] for h in self._entities.get('media', [])])
    if len(media):
        return media[0]
    return self._user.get('profile_image_url', None)