def worker(args):
    """Filter one worker's share of hourly stream chunks down to
    event-relevant sentences.

    args is a single tuple (so the function can be used with a
    multiprocessing map):
        rc_dir   -- directory containing input '<hour>.sc.gz' chunks
        out_dir  -- directory to write filtered '<hour>.sc.gz' chunks
        hours    -- sequence of hour labels to process
        event    -- event object handed to ArticleDetector
        ad_dir   -- directory holding 'article_vectorizer.pkl' and
                    'article_clf.pkl'
        log_file -- path of the tab-separated per-hour stats log
                    (hour, n_docs, n_sents, n_rel_docs, n_rel_sents)

    IOErrors for a given hour are printed and processing continues
    with the next hour.
    """
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)
    n_hours = len(hours)
    # 'with' guarantees the log file is closed even if an hour fails.
    with open(log_file, 'w') as lgf:
        for h, hour in enumerate(hours, 1):
            n_docs = 0
            n_sents = 0
            n_rel_docs = 0
            n_rel_sents = 0
            chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
            opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
            ochunk = sc.Chunk(path=opath, mode='wb')
            try:
                try:
                    for si in sc.Chunk(path=chunk):
                        n_docs += 1
                        # Prefer serif sentence annotations, fall back to
                        # lingpipe; skip docs with neither.
                        if u'serif' in si.body.sentences:
                            annotator = u'serif'
                        elif u'lingpipe' in si.body.sentences:
                            annotator = u'lingpipe'
                        else:
                            continue
                        n_sents += len(si.body.sentences[annotator])
                        sent_idxs = artcl_detect.find_articles(si, annotator)
                        n_idxs = len(sent_idxs)
                        if n_idxs > 0:
                            n_rel_docs += 1
                            n_rel_sents += n_idxs
                            rel_sents = [si.body.sentences[annotator][i]
                                         for i in sent_idxs]
                            si.body.sentences['article-clf'] = rel_sents
                            ochunk.add(si)
                finally:
                    # Close the output chunk whether the hour succeeded
                    # or raised (the original leaked it on error).
                    ochunk.close()
                lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    hour, n_docs, n_sents, n_rel_docs, n_rel_sents))
                lgf.flush()
            except IOError as e:
                print(str(e))
# NOTE(review): this is a byte-for-byte duplicate (modulo wrapping) of the
# worker defined immediately above; in Python the later definition shadows
# the earlier one. Consider deleting one copy.
def worker(args):
    """Filter one worker's share of hourly stream chunks down to
    event-relevant sentences.

    args is a single tuple (so the function can be used with a
    multiprocessing map):
        rc_dir   -- directory containing input '<hour>.sc.gz' chunks
        out_dir  -- directory to write filtered '<hour>.sc.gz' chunks
        hours    -- sequence of hour labels to process
        event    -- event object handed to ArticleDetector
        ad_dir   -- directory holding 'article_vectorizer.pkl' and
                    'article_clf.pkl'
        log_file -- path of the tab-separated per-hour stats log
                    (hour, n_docs, n_sents, n_rel_docs, n_rel_sents)

    IOErrors for a given hour are printed and processing continues
    with the next hour.
    """
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)
    n_hours = len(hours)
    # 'with' guarantees the log file is closed even if an hour fails.
    with open(log_file, 'w') as lgf:
        for h, hour in enumerate(hours, 1):
            n_docs = 0
            n_sents = 0
            n_rel_docs = 0
            n_rel_sents = 0
            chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
            opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
            ochunk = sc.Chunk(path=opath, mode='wb')
            try:
                try:
                    for si in sc.Chunk(path=chunk):
                        n_docs += 1
                        # Prefer serif sentence annotations, fall back to
                        # lingpipe; skip docs with neither.
                        if u'serif' in si.body.sentences:
                            annotator = u'serif'
                        elif u'lingpipe' in si.body.sentences:
                            annotator = u'lingpipe'
                        else:
                            continue
                        n_sents += len(si.body.sentences[annotator])
                        sent_idxs = artcl_detect.find_articles(si, annotator)
                        n_idxs = len(sent_idxs)
                        if n_idxs > 0:
                            n_rel_docs += 1
                            n_rel_sents += n_idxs
                            rel_sents = [si.body.sentences[annotator][i]
                                         for i in sent_idxs]
                            si.body.sentences['article-clf'] = rel_sents
                            ochunk.add(si)
                finally:
                    # Close the output chunk whether the hour succeeded
                    # or raised (the original leaked it on error).
                    ochunk.close()
                lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    hour, n_docs, n_sents, n_rel_docs, n_rel_sents))
                lgf.flush()
            except IOError as e:
                print(str(e))
def _article_resource_worker(job_queue, result_queue, **kwargs):
    """Multiprocessing worker: drain (opath, chunk_paths) jobs from
    job_queue, write event-relevant stream items to opath, and put a
    None sentinel on result_queue per completed job.

    kwargs:
        event  -- event object providing regex_pattern(); also handed
                  to ArticleDetector
        corpus -- corpus object providing sc_msg() and get_sentences()
    """
    # Ignore SIGINT so Ctrl-C is handled by the parent process only.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    event = kwargs.get(u'event')
    corpus = kwargs.get(u'corpus')
    while not job_queue.empty():
        try:
            opath, chunk_paths = job_queue.get(block=False)
            artcl_detect = ArticleDetector(event)
            patt = event.regex_pattern()
            with sc.Chunk(path=opath, mode='wb',
                          message=corpus.sc_msg()) as ochunk:
                for path in chunk_paths:
                    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
                        if si.body.clean_visible is None:
                            continue
                        # BUG FIX: the original called
                        # patt.search(si.body.clean_visible, re.I).
                        # On a *compiled* pattern the second positional
                        # argument is the start position, not flags, so
                        # re.I (== 2) silently skipped the first two
                        # characters of every document. Case-insensitive
                        # matching must be compiled into the pattern
                        # returned by event.regex_pattern() instead.
                        if patt.search(si.body.clean_visible):
                            sentences = corpus.get_sentences(si)
                            sent_idxs = artcl_detect.find_articles(sentences)
                            if len(sent_idxs) > 0:
                                rel_sents = [sentences[i] for i in sent_idxs]
                                si.body.sentences[u'article-clf'] = rel_sents
                                ochunk.add(si)
            result_queue.put(None)
        except Queue.Empty:
            # Another worker took the last job between our empty() check
            # and get(); just loop and re-check.
            pass