def main():
    args = docopt(__doc__)
    c = get_config()

    # Create a new Graph object
    g = rdflib.Graph()

    print "Loading files: ",
    sys.stdout.flush()

    # Parse data files (could take a while: 4,000,000 triples take ~10 minutes)
    for fn in files:  # `files`: list of .ttl.gz file names, defined at module level
        print fn,
        sys.stdout.flush()
        with gzip.open(os.path.join(c.ttl_dir, fn)) as f:
            g.parse(f, format='turtle')

    print "\nQuerying..."
    # Compile the query
    qres = g.query("""
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX lp: <http://purl.org/linkedpolitics/>
        PREFIX lpv: <http://purl.org/linkedpolitics/vocabulary/>
        PREFIX xml: <http://www.w3.org/XML/1998/namespace>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>

        SELECT ?date ?speaker ?firstname ?lastname ?country ?text
        WHERE {
            ?sessionday dcterms:hasPart ?agendaitem.
            ?sessionday dc:date ?date.
            ?agendaitem dcterms:hasPart ?speech.
            ?speech lpv:speaker ?speaker.
            ?speaker lpv:countryOfRepresentation ?countryobj.
            ?countryobj lpv:acronym ?country.
            ?speaker foaf:givenName ?firstname.
            ?speaker foaf:familyName ?lastname.
            ?speech lpv:text ?text.
        }
        """)

    # The query is actually executed now; it takes a while (~3 min)
    print "Found %d records" % len(qres)

    # Write out the CSV file
    with gzip.open(os.path.join(c.textdb_dir, 'English.csv.gz'), 'wb') as csvfile:
        csv_headers = ['Date', 'SpeakerURI', 'Firstname', 'Lastname', 'Country', 'Speech']
        speechwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        speechwriter.writerow(csv_headers)
        for row in progress.bar(qres, label='Writing CSV ', expected_size=len(qres), every=1000):
            csv_line = [x.encode('utf8').strip() for x in row]
            speechwriter.writerow(csv_line)
    print 'Done'
def main():
    args = docopt(__doc__)
    feature_name = args['<feature_name>']
    assert feature_name == 'words'
    assert args['<experimentset_name>'] in EXPERIMENT_SETS, \
        '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys())
    c = get_config()
    experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](feature_name=feature_name)

    print "Computing foreground group sums using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    fg_groups = experiment_set.list_foreground_groups()
    cache = {}
    try:
        for group_name, sum_vector in progress.bar(
                pool.imap_unordered(ComputeForegroundGroupSumCallable(experiment_set), fg_groups),
                label="Progress ", expected_size=len(fg_groups)):
            cache[group_name] = sum_vector
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Computing background sums..."
    bg_groups = experiment_set.list_background_groups()
    for g in bg_groups:
        sum_vector = experiment_set.compute_background_group_sum(g, cache)
        cache[g] = sum_vector

    print "Saving sums to ZODB..."
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'group_sums', None) is None:
        zodb_root.group_sums = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if feature_name not in zodb_root.group_sums:
        zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()
    for k, v in cache.iteritems():
        zodb_root.group_sums[feature_name][k] = v
    transaction.commit()

    print "Creating output db tables..."
    create_db(c.resultsdb_url)
    session_out = open_db(c.resultsdb_url)

    print "Computing overrepresentation using %d cores..." % c.num_cores
    exps = experiment_set.list_experiments()
    cls = experiment_set.result_table_class()
    try:
        for fg, bg, results in progress.bar(
                pool.imap_unordered(ComputeOverrepresentedWordsCallable(experiment_set), exps),
                label="Progress ", expected_size=len(exps)):
            for w, odds, pval in results:
                # Use a fresh name here so the config object `c` is not shadowed
                result_row = cls(foreground_group_name=fg, background_group_name=bg,
                                 word=w, odds=odds, pval=pval)
                session_out.add(result_row)
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Committing..."
    session_out.commit()
    print "Done"
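# The `init_worker` pool initializer is referenced above but not shown in this section.
# A minimal sketch of the usual idiom its usage implies (workers ignore SIGINT so that
# Ctrl-C surfaces only as KeyboardInterrupt in the parent, which then terminates the
# pool) -- an assumption based on how the pool is used, not the project's confirmed code:

import signal

def init_worker():
    # Worker processes ignore SIGINT; the parent catches KeyboardInterrupt
    # and calls pool.terminate() / pool.join() itself.
    signal.signal(signal.SIGINT, signal.SIG_IGN)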
def open_db(db_url=None):
    "Returns an initialized Session object. If db_url is not specified, uses get_config().db_url."
    if db_url is None:
        from talkofeuropedb.config import get_config
        db_url = get_config().db_url
    e = create_engine(db_url)
    Session = sessionmaker(e)
    return Session()
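# Example usage (illustrative): the returned object is an ordinary SQLAlchemy
# Session, so the usual query API applies.
#
#     session = open_db()
#     n_english = session.query(Speech).filter(Speech.lang == 'en').count()
#     session.close()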
def main():
    args = docopt(__doc__)
    c = get_config()
    print "Downloading files into %s" % c.ttl_dir
    for g in graph_list:
        target_file = os.path.join(c.ttl_dir, g.split('/')[-1] + '.ttl.gz')
        source_url = ("http://linkedpolitics.ops.few.vu.nl/api/export_graph"
                      "?graph=%s&mimetype=text%%2Fplain&format=turtle") % urllib.quote(g, '')
        print "Downloading %s..." % g
        download_gzipped(source_url, target_file)
    print "Done"
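# `download_gzipped` is defined elsewhere. Since the export URL serves plain text
# (mimetype=text%2Fplain) while the loader later reads the files with gzip.open(),
# a plausible sketch is "fetch the URL and write the response gzip-compressed" --
# an assumption about the helper, not its confirmed implementation:

import gzip
import urllib2

def download_gzipped(source_url, target_file):
    # Stream the response into a local gzip-compressed file in chunks.
    response = urllib2.urlopen(source_url)
    with gzip.open(target_file, 'wb') as out:
        while True:
            chunk = response.read(64 * 1024)
            if not chunk:
                break
            out.write(chunk)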
def open_zodb(config=None, read_only=False):
    "Opens a Zope database and returns a root object. If config is not specified, get_config() is used."
    if config is None:
        from talkofeuropedb.config import get_config
        config = get_config()
    storage = ZODB.FileStorage.FileStorage(os.path.join(config.zodb_dir, 'zodb.fs'),
                                           read_only=read_only)
    db = ZODB.DB(storage)
    connection = db.open()
    root = connection.root
    return root
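# Example usage (illustrative): attribute-style access on the returned root works
# because ZODB exposes connection.root as a convenience wrapper over the root mapping.
#
#     zodb_root = open_zodb(read_only=True)
#     all_words = getattr(zodb_root, 'all_words', None)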
def main():
    args = docopt(__doc__)
    c = get_config()
    session = open_db()
    speeches = session.query(Speech).all()
    total_speeches = len(speeches)  # For progress bar purposes

    print "Computing using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    try:
        for id, lang in progress.bar(pool.imap_unordered(detect_language, speeches),
                                     label='Progress ', expected_size=total_speeches, every=1000):
            session.query(Speech).get(id).lang = lang
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Committing..."
    session.commit()
    num_english_texts = session.query(Speech).filter(Speech.lang == 'en').count()
    print "Done. English texts: %d" % num_english_texts
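# `detect_language` is referenced above but defined elsewhere. Its contract is visible
# from the loop: it takes a Speech and returns an (id, lang) pair. One plausible
# implementation using the langdetect package -- the choice of detector is an
# assumption, the project may use a different one:

from langdetect import detect

def detect_language(speech):
    # Runs in a worker process; return plain values so the result pickles cheaply.
    return speech.id, detect(speech.speech)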
def main():
    args = docopt(__doc__)
    c = get_config()
    e = create_engine(c.db_url)
    Base.metadata.drop_all(e)
    Base.metadata.create_all(e)
    Session = sessionmaker(e)
    s = Session()

    with gzip.open(os.path.join(c.textdb_dir, 'English.csv.gz'), 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        reader.next()  # Skip header
        for row in progress.mill(reader, label='Writing to DB ', expected_size=254253, every=1000):
            sp = Speech(date=datetime.strptime(row[0], '%Y-%m-%d'),
                        speaker_uri=unicode(row[1], 'utf-8'),
                        first_name=unicode(row[2], 'utf-8'),
                        last_name=unicode(row[3], 'utf-8'),
                        country=row[4],
                        speech=unicode(row[5], 'utf-8'))
            s.add(sp)

    print "Committing..."
    s.commit()
    print "Done"
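# The Speech model itself is defined elsewhere. A minimal sketch consistent with the
# columns used across these scripts (the constructor fields above, plus the `lang`
# column set by the language-detection step) -- the column types are assumptions:

from sqlalchemy import Column, Date, Integer, String, Unicode, UnicodeText
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Speech(Base):
    __tablename__ = 'speech'
    id = Column(Integer, primary_key=True)
    date = Column(Date)
    speaker_uri = Column(Unicode(255))
    first_name = Column(Unicode(255))
    last_name = Column(Unicode(255))
    country = Column(String(8))    # Country acronym, e.g. "NL"
    speech = Column(UnicodeText)   # Full text of the speech
    lang = Column(String(8))       # Filled in by the language-detection step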
def main():
    args = docopt(__doc__)
    extractor_name = args['<feature_name>']
    extractor = getattr(talkofeuropewords.extract, extractor_name, None)
    if extractor is None:
        print "Unknown extractor name"
        sys.exit(1)
    c = get_config()
    s = open_db()

    print "Preparing ZODB"
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'features', None) is None:
        zodb_root.features = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if extractor_name not in zodb_root.features:
        zodb_root.features[extractor_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()

    runner = TaskRunner(extractor)
    print "Querying database..."
    speeches = s.query(Speech).filter(Speech.lang == 'en').all()
    total_speeches = len(speeches)

    print "Computing using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    try:
        for i, (id, result) in enumerate(progress.bar(pool.imap_unordered(runner, speeches),
                                                      label='Progress ',
                                                      expected_size=total_speeches,
                                                      every=1000), 1):
            zodb_root.features[extractor_name][id] = result
            if i % 1000 == 0:
                transaction.commit()  # Commit in batches to keep the transaction small
        transaction.commit()
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()
    print "Done"
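# `TaskRunner` wraps an extractor function; from the loop above its contract is
# "a picklable callable that maps a Speech to an (id, result) pair". A minimal
# sketch under that assumption:

class TaskRunner(object):
    def __init__(self, extractor):
        self.extractor = extractor

    def __call__(self, speech):
        # Instances are pickled to worker processes, so keep the state small.
        return speech.id, self.extractor(speech)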
def main():
    args = docopt(__doc__)
    c = get_config()
    session = open_db()

    print "Finding 5 most active countries"
    countries = session.query(Speech.country, func.count(Speech.id)) \
                       .filter(Speech.lang == 'en') \
                       .group_by(Speech.country) \
                       .order_by(desc(func.count(Speech.id))) \
                       .limit(5).all()
    print countries
    # Use a distinct loop variable so the config object `c` is not shadowed
    country_codes = [country for country, count in countries]

    print "Collecting words used by each country using 5 cores"
    pool = Pool(5, init_worker)
    try:
        word_sets = pool.map(country_words, country_codes)
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()
        return  # word_sets is undefined at this point, so we cannot continue

    print "Collected word sets with sizes: ", map(len, word_sets)
    print "Computing intersection..."
    word_set = reduce(lambda x, y: x & y, word_sets)
    print "Result size: ", len(word_set)

    print "Subtracting stopwords..."
    nltk.download('stopwords')
    langs = ['english', 'dutch', 'french', 'italian', 'portuguese', 'swedish', 'german', 'spanish']
    all_stopwords = reduce(lambda x, y: x | y,
                           [set(nltk.corpus.stopwords.words(lng)) for lng in langs])
    all_stopwords = set(map(unidecode, all_stopwords))
    word_set = word_set - all_stopwords
    print "Resulting word set size: ", len(word_set)

    print "Saving..."
    zodb_root = open_zodb()
    zodb_root.all_words = word_set
    transaction.commit()
    print "Done"
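# `country_words` is defined elsewhere. From its use above it must map a country code
# to the set of distinct words occurring in that country's English speeches. A rough
# sketch (the tokenization rule is an assumption; each worker opens its own session):

import re

def country_words(country_code):
    session = open_db()
    words = set()
    for (text,) in session.query(Speech.speech) \
                          .filter(Speech.lang == 'en', Speech.country == country_code):
        words |= set(re.findall(r"[a-z']+", text.lower()))
    session.close()
    return words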
def __init__(self, pval_cutoff=0.01, feature_name='words'):
    self.pval_cutoff = pval_cutoff
    self.feature_name = feature_name
    self.config = get_config()