def _compile_forms_regex(forms):
    """
    Compile a case-insensitive whole-word matcher for any of the given forms.

    Returns None when `forms` is empty, so that an n-gram without any label
    form never matches (an empty alternation would match almost any text).
    """
    alternatives = "|".join(re.escape(form) for form in forms)
    if not alternatives:
        return None
    # The non-capturing group makes both \b anchors apply to EVERY
    # alternative; without it only the first and last forms are anchored.
    return re.compile(r"\b(?:%s)\b" % alternatives, re.I | re.M | re.U)


def search_subworker(config, content, year, doublet):
    """
    Look for a pair of n-grams in a text and increment their cooccurrence count.

    Parameters:
      config   -- configuration dict; config['cooccurrences'] is passed to
                  output.getConfiguredOutputs() to obtain the 'mongodb' output
      content  -- text searched for both n-grams
      year     -- prefix of the cooccurrence document ID (joined with "_",
                  so presumably a string -- confirm with the caller)
      doublet  -- pair of n-gram dicts; each one is read through its 'label',
                  'id' and ['edges']['label'] keys, the latter holding the
                  label forms to search for

    When both n-grams are found, the counter stored in the 'coocmatrix'
    collection under "<year>_<id0>_<id1>" is incremented.  If only the
    reversed ID "<year>_<id1>_<id0>" already exists, that document is
    incremented instead.  Returns None.
    """
    logging.debug(
        "looking for cooc of %s and %s",
        doublet[0]['label'], doublet[1]['label'])
    outputs = output.getConfiguredOutputs(config['cooccurrences'])

    regex1 = _compile_forms_regex(doublet[0]['edges']['label'].keys())
    regex2 = _compile_forms_regex(doublet[1]['edges']['label'].keys())
    if regex1 is None or regex2 is None:
        return
    if regex1.search(content) is None or regex2.search(content) is None:
        return

    logging.debug("found a cooc !")
    coocmatrix = outputs['mongodb'].mongodb.coocmatrix
    # the pair may already be stored under either of the two composed IDs
    doublet_id12 = "_".join((year, doublet[0]["id"], doublet[1]["id"]))
    doublet_id21 = "_".join((year, doublet[1]["id"], doublet[0]["id"]))

    # 'id12' is preferred; 'id21' is used only when it is the one that exists
    if (coocmatrix.find_one({'_id': doublet_id12}) is None
            and coocmatrix.find_one({'_id': doublet_id21}) is not None):
        target_id = doublet_id21
    else:
        target_id = doublet_id12

    # The update document holds ONLY the $inc operator: MongoDB rejects
    # documents mixing plain fields ('_id') with update operators.  With
    # upsert=True a missing document is created with value 1, so no separate
    # save() is needed and a concurrent insert is not overwritten.
    coocmatrix.update(
        {'_id': target_id},
        {'$inc': {'value': 1}},
        upsert=True)
def main(config):
    """
    Entry point of the cooccurrences processor.

    Loads the whitelist whose path is given in
    config['cooccurrences']['whitelist']['path'], writes every n-gram of it
    to the 'exportwhitelistcsv' output, then calls worker() on each notice of
    the input database, passing along the loaded whitelist.
    """
    settings = config['cooccurrences']
    whitelistpath = settings["whitelist"]["path"]
    loading_message = "loading whitelist from %s (id = %s)" % (
        whitelistpath, whitelistpath)
    logging.debug(loading_message)

    # the whitelist path doubles as the whitelist identifier
    wlimport = Reader(
        'whitelist://' + whitelistpath,
        dialect="excel",
        encoding="ascii")
    wlimport.whitelist = whitelist.Whitelist(whitelistpath, whitelistpath)
    newwl = wlimport.parse_file()
    newwl['content'] = []

    # cursor over the n-grams stored by the whitelist
    ngramgenerator = newwl.getNGram()
    outputs = output.getConfiguredOutputs(settings)
    try:
        # the loop only ends when the cursor raises StopIteration
        while True:
            ngid, ng = ngramgenerator.next()
            newwl['content'] += [ng]
            csv_line = "%s,%s\n" % (ngid, ng['label'])
            outputs['exportwhitelistcsv'].save(csv_line)
    except StopIteration:
        imported_count = len(newwl['content'])
        logging.debug(
            'imported %d n-lemmes from the whitelist file %s'
            % (imported_count, whitelistpath))

    notices_db = mongodbhandler.MongoDB(settings['input_db'])
    # notices are processed sequentially, one worker() call per notice
    for notice in notices_db.notices.find(timeout=False):
        worker(config, notice, newwl)
def exportcooc(config):
    """
    Dump every stored cooccurrence to the 'coocmatrixcsv' output.

    Each document of the 'coocmatrix' collection has an ID of the form
    "<year>_<ngram i>_<ngram j>" and a 'value' counter; one CSV line
    "<ngram i>,<ngram j>,<count>,<year>" is saved per document.
    """
    outputs = output.getConfiguredOutputs(config['cooccurrences'])
    coocmatrix = outputs['mongodb'].mongodb.coocmatrix
    for entry in coocmatrix.find():
        # the composed ID must split into exactly three parts
        year, first_id, second_id = entry['_id'].split("_")
        count = entry['value']
        csv_line = "%s,%s,%d,%s\n" % (first_id, second_id, count, year)
        outputs['coocmatrixcsv'].save(csv_line)
def extract_worker(config, fieldname):
    """
    Copy the input notices matching a regexp to the 'mongodb' output.

    The regexp is read from
    config['extractor']['filters']['regexp_content']['regexp'] and applied,
    case-insensitively, to the `fieldname` field of the notices found in the
    input database; every matching notice is saved to "notices".
    """
    extractor_conf = config['extractor']
    source_db = mongodbhandler.MongoDB(extractor_conf['input_db'])
    outputs = output.getConfiguredOutputs(extractor_conf)

    pattern = extractor_conf['filters']['regexp_content']['regexp']
    content_filter = re.compile(pattern, re.I | re.U | re.M)
    query = {fieldname: {"$regex": content_filter}}

    for notice in source_db.notices.find(query, timeout=False):
        outputs['mongodb'].save(notice, "notices")