class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb.

    Instantiating the class runs the whole import: the constructor
    connects to the database and then immediately walks the archive.
    """

    # Class attribute, hence one dict shared by every instance.
    counts = {'files_not_processed': 0}

    def __init__(self, config, data_directory, overwrite=False,
                 maxcables=None):
        """
        Connects to the database and starts the import.

        config         -- mapping read at config['general']['mongodb']
        data_directory -- archive root; cables are read from its
                          "cable" sub-directory
        overwrite      -- when exactly True, an existing "cables"
                          collection is dropped before importing
        maxcables      -- optional limit handed to walk_archive()
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        # Identity test on purpose: only the literal True triggers the drop.
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory and processes every cable found.

        Stops early once maxcables is not None and self.cable_list holds
        at least that many entries.  An OSError raised during the walk is
        logged and not re-raised.
        """
        # Reset here; presumably filled by process_cable(), which is
        # defined outside this excerpt -- TODO confirm.
        self.cable_list = []
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                if maxcables is not None and len(self.cable_list) >= maxcables:
                    break
        except OSError as oserr:
            # "except X as e" parses on Python 2.6+ and on Python 3,
            # unlike the former "except X, e" spelling.
            logging.error("%s", oserr)
class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb

    usage :
        importer = CableImporter(config, 'data/cablegate.wikileaks.org')

    The constructor performs the import itself, so no further call is
    needed after instantiation.
    """

    # Shared by all instances because it lives on the class.
    counts = {'files_not_processed': 0}

    def __init__(self, config, data_directory, overwrite=False,
                 maxcables=None):
        """
        Sets up the cable directory and database handle, optionally
        drops the "cables" collection, then walks the archive.

        The collection is dropped only when overwrite is exactly True
        and the collection already exists.
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory

        Every cable yielded by cables_from_directory() is passed to
        process_cable() together with the overwrite flag.  When
        maxcables is given, the walk ends as soon as self.cable_list
        reaches that length.  OSError is logged rather than propagated.
        """
        self.cable_list = []
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                limit_reached = (maxcables is not None
                                 and len(self.cable_list) >= maxcables)
                if limit_reached:
                    break
        except OSError as oserr:
            # Portable across Python 2.6+ and 3; the comma form is a
            # SyntaxError on Python 3.
            logging.error("%s", oserr)
def __init__(self, config, data_directory, overwrite=False, maxcables=None):
    """
    Prepares the importer and runs the import right away.

    Cables are read from the "cable" sub-directory of data_directory.
    An existing "cables" collection is dropped first when overwrite is
    exactly True.  maxcables is forwarded untouched to walk_archive().
    """
    self.data_directory = join(data_directory, "cable")

    mongo_settings = config['general']['mongodb']
    self.mongodb = CablegateDatabase(mongo_settings)["cablegate"]

    # Only ask the database for its collections when a reset was requested.
    if overwrite is True:
        if "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")

    self.walk_archive(overwrite, maxcables)
def extract(config, overwrite=True, maxcables=None):
    """
    gets all the cables from storage, then extracts ngrams and produces
    network edges and weights

    Each selected cable is handed to worker() through a process pool of
    config['general']['processes'] processes.

    config    -- configuration mapping (reads ['general']['mongodb'],
                 ['general']['processes'] and ['extraction']['tagger'])
    overwrite -- when exactly True, the "ngrams" and "cooc" collections
                 are dropped first; the flag is also passed to worker()
    maxcables -- maximum number of cables to submit; None means the
                 current size of the cables collection
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])
    if overwrite is True:
        for name in ("ngrams", "cooc"):
            if name in mongodb.collection_names():
                mongodb.drop_collection(name)
    count = 0
    if maxcables is None:
        maxcables = mongodb.cables.count()
    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        cursor = mongodb.cables.find(timeout=False)
        try:
            for cable in cursor:
                # just a hack: cables that already carry NGram edges
                # are skipped
                if cable['edges']['NGram']:
                    continue
                extractionpool.apply_async(
                    worker, (config, cable, filters, postagger, overwrite))
                count += 1
                if count >= maxcables:
                    break
        finally:
            # A cursor opened with timeout=False is never reaped by the
            # server, so close it explicitly -- also on early break or error.
            cursor.close()
    finally:
        # Always shut the pool down and wait for the submitted work,
        # even if iterating the cursor failed.
        extractionpool.close()
        extractionpool.join()
def __init__(self, config, graphtype, minoccs=1, maxcoocs=1, maxcables=None,
             year=None):
    """
    Sets up both database handles and builds the requested network.

    graphtype None or "occurrences" builds the occurrences network only;
    "cooccurrences" builds the occurrences network and then feeds the
    two caches it returns into the cooccurrences update.  Any other
    value builds nothing.
    """
    self.mongodb = CablegateDatabase(
        config['general']['mongodb'])["cablegate"]
    self.graphdb = GraphDatabase(config['general']['neo4j'])
    self.config = config

    occurrences_only = graphtype is None or graphtype == "occurrences"
    with_cooccurrences = (not occurrences_only
                          and graphtype == "cooccurrences")
    if not occurrences_only and not with_cooccurrences:
        return

    # Both supported graph types start from the occurrences network.
    caches = self.update_occurrences_network(
        minoccs, maxcoocs, maxcables, year, documents=False)

    if with_cooccurrences:
        (nodecache, ngramcache) = caches
        self.update_cooccurrences_network(
            nodecache, ngramcache, minoccs, maxcoocs)
def extract(config, overwrite=True, maxcables=None):
    """
    gets all the cables from storage, then extracts ngrams and produces
    network edges and weights

    Cables are read from the "cables" collection and dispatched
    asynchronously to worker() on a multiprocessing pool.  Cables whose
    ['edges']['NGram'] mapping is non-empty are skipped.  At most
    maxcables cables are submitted (default: the collection's count).
    When overwrite is exactly True the "ngrams" and "cooc" collections
    are dropped beforehand.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])
    if overwrite is True and "ngrams" in mongodb.collection_names():
        mongodb.drop_collection("ngrams")
    if overwrite is True and "cooc" in mongodb.collection_names():
        mongodb.drop_collection("cooc")
    if maxcables is None:
        maxcables = mongodb.cables.count()
    submitted = 0
    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        cables = mongodb.cables.find(timeout=False)
        try:
            for cable in cables:
                ## just a hack
                if cable['edges']['NGram']:
                    continue
                task_args = (config, cable, filters, postagger, overwrite)
                extractionpool.apply_async(worker, task_args)
                submitted += 1
                if submitted >= maxcables:
                    break
        finally:
            # timeout=False disables the server-side idle timeout, so the
            # cursor has to be released by hand on every exit path.
            cables.close()
    finally:
        # Guarantee the pool is closed and drained even when the loop
        # above raised.
        extractionpool.close()
        extractionpool.join()
def worker(config, cable, filters, postagger, overwrite):
    """
    Extracts the ngrams of one cable and saves its edges.

    Opens its own "cablegate" database handle from config, resets the
    cable's edges through initEdges() when overwrite is exactly True,
    runs the NGramizer extraction, updates the cooccurrences and finally
    writes cable['edges'] back to the cables collection.
    """
    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]

    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    # (the return value of extract() is ignored, so it presumably fills
    # cable['edges'] in place -- TODO confirm)
    extractor = NGramizer(config)
    stemmer = PorterStemmer()
    extractor.extract(cable, filters, postagger, stemmer)

    update_cable_cooc(cable, database)

    selector = {"_id": cable['_id']}
    change = {"$set": {"edges": cable['edges']}}
    database.cables.update(selector, change)
def __init__(self, config, data_directory, overwrite=False, maxcables=None):
    """
    Builds the importer, optionally resets the "cables" collection and
    launches the archive walk.

    The reset happens only if overwrite is exactly True and the
    collection is already present.
    """
    cable_directory = join(data_directory, "cable")
    self.data_directory = cable_directory

    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    self.mongodb = database

    # Short-circuits: the collection list is fetched only for overwrite=True.
    must_reset = overwrite is True and "cables" in database.collection_names()
    if must_reset:
        database.drop_collection("cables")

    self.walk_archive(overwrite, maxcables)
def __init__(self, config):
    """
    Stores the configuration and the "cablegate" database handle
    obtained from CablegateDatabase(config['general']['mongodb']).
    """
    mongo_settings = config['general']['mongodb']
    connection = CablegateDatabase(mongo_settings)
    self.mongodb = connection["cablegate"]
    self.config = config