def calculate_text_te(n_samples_list=None, normalised=True):
    n_users, n_dims = 12, 50
    if not n_samples_list:
        n_samples_list = get_n_samples_list()
        # n_samples_list = n_samples_list[2:8]
    print(n_samples_list)
    for n_samples in n_samples_list:
        data_info = {'n_users': n_users, 'n_samples': n_samples, 'n_dims': n_dims}
        # Use the dict of data info defined above.
        tp = TextProcessor(data_info['n_users'], data_info['n_samples'],
                           data_info['n_dims'])
        data = tp.load_vec('lda')
        print(data.shape)
        cn, te_mat = calculate_te(data, 'lda', normalised=normalised)
        print('te_lda_%d_%d_%d' % (data_info['n_users'], data_info['n_samples'],
                                   data_info['n_dims']))
        print(cn)
        print(te_mat)
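# A minimal driver sketch (an assumption, not part of the original snippet):
# run the LDA transfer-entropy sweep over the default sample sizes when the
# module is executed directly.
if __name__ == '__main__':
    calculate_text_te(normalised=True)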
def parse_documents():
    arxiv_parser = ArxivParser(TextProcessor())
    springer_parser = SpringerParser(TextProcessor())
    for document in Document.select(lambda doc: not doc.is_processed):
        # Pick a parser based on the host part of the document URL.
        if "arxiv.org" in urlparse(document.url)[1]:
            cur_parser = arxiv_parser
        elif "springer.com" in urlparse(document.url)[1]:
            cur_parser = springer_parser
        else:
            continue
        page = WebPage.from_disk(document.url, document.file_path)
        # Drop documents whose stored hash no longer matches the page on disk.
        if document.document_hash != page.page_hash:
            Document[document.id].delete()
            continue
        parsed = cur_parser.parse(page)
        document.is_processed = True
        commit()
        logging.debug(("Article: {}" if parsed else "{}").format(document.url))
def setup_ranker():
    global ranker
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(
            AbstractAndArticle(article,
                               _read_file(article.processed_abstract_path)))
    ranker = TfIdf(index, text_processor, docs, VECTORS_PER_FILE,
                   VECTORS_SAVE_FOLDER)
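# A minimal usage sketch (an assumption; rank_query is a hypothetical helper,
# not part of the original code): once setup_ranker() has populated the global
# ranker, queries can be answered with ranker.rank(), mirroring run_rank() below.
def rank_query(query, top_n=5):
    top_ids = ranker.rank(query, top_n)
    return [(Article[article_id].title, Article[article_id].document.url)
            for article_id in top_ids]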
def run_rank():
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(
            AbstractAndArticle(article,
                               _read_file(article.processed_abstract_path)))
    ranker = TfIdf(index, text_processor, docs,
                   vectors_per_file=VECTORS_PER_FILE,
                   vectors_save_folder=VECTORS_SAVE_FOLDER)
    # Interactive loop: print the title and URL of the top-5 ranked articles.
    while True:
        query = input("Enter query: ")
        top_ids = ranker.rank(query, 5)
        for article_id in top_ids:
            article = Article[article_id]
            print(article.title, article.document.url)
logging.info('Starting Application...')
# for key in logging.Logger.manager.loggerDict:
#     logging.getLogger(key).setLevel(logging.CRITICAL)

# Initialize Threads
streamer = Streamer(name='Streamer', keywords=seed_keywords,
                    credentials=credentials['coll_1'],
                    tp_queue=text_processor_queue, filter_params=filters,
                    kw_queue=keyword_queue, limit_queue=lim_queue,
                    message_queue=mess_queue)
text_processor = TextProcessor(name='Text Processor', database=db,
                               tp_queue=text_processor_queue, dictionary=d)
annotator = Annotator(name='Annotator', database=db, train_event=te,
                      annotation_response=annot_resp, socket=socketio,
                      train_threshold=n_before_train, message_queue=mess_queue)
classifier = Classifier(name='Classifier', database=db, model=model_queue,
                        dictionary=d)
monitor = Monitor(name='Monitor', database=db, socket=socketio,
    parser.add_argument('--train_file', type=str, required=True,
                        help='raw training corpus path')
    parser.add_argument('--seg_train_file', type=str, required=True,
                        help='segmented training data path')
    return parser.parse_args()


def seg_file(tp, train_file, seg_train_file):
    # Each input line is a tab-separated record: cmsid, label, title, content.
    # Write it back out with the title and content segmented into tokens.
    fw = open(seg_train_file, 'w')
    with open(train_file) as f:
        for line in f:
            cmsid, label, title, content = line.strip('\n').split('\t')
            fw.write(cmsid + '\t' + label + '\t' +
                     ' '.join(tp.seg_text(title)[0]) + '\t' +
                     ' '.join(tp.seg_text(content)[0]) + '\n')
    fw.close()


if __name__ == '__main__':
    args = process_args()
    print("train_file:" + args.train_file)
    print("seg_train_file:" + args.seg_train_file)
    tp = TextProcessor("/data/ainlp/classification/data/new_first_data/")
    # Segment the raw training file and save the result.
    seg_file(tp, args.train_file, args.seg_train_file)
                seg_title = ' '.join(tp.seg_text(title)[0])
                out_fields.append(seg_title)
            elif i == cont_col:
                # Normalise punctuation, strip affixes, and truncate the
                # content field before segmenting it.
                cont = fields[i]
                cont = tp.replace_punc(cont, '。')
                cont = tp.filter_affix(cont)
                cont = tp.truncate(cont)
                cont = cont.encode('utf8')
                seg_cont = ' '.join(tp.seg_text(cont)[0])
                out_fields.append(seg_cont)
            else:
                out_fields.append(fields[i])
        fout.write('%s\n' % '\t'.join(out_fields))
    fout.close()


if __name__ == '__main__':
    args = process_args()
    tp = TextProcessor('/data/ainlp/classification/data/')
    logging.info('Start segmenting corpus ...')
    logging.info('raw_corpus: %s' % args.raw_corpus)
    logging.info('seg_corpus: %s' % args.seg_corpus)
    segment_file(tp, args.raw_corpus, args.seg_corpus,
                 args.title_col, args.cont_col)
    logging.info('Finished')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s (%(threadName)s) %(message)s',
                    filename='debug.log')
logging.info('\n' * 5)
logging.info('*' * 10 + 'ACTIVE STREAM' + '*' * 10)
logging.info('Starting Application...')
logging.getLogger('socketio').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)

# Initialize Threads
streamer = Streamer(credentials_track=credentials['coll_1'],
                    credentials_sample=credentials['main_account'],
                    data=data)
text_processor = TextProcessor(data)
annotator = Annotator(train_threshold=n_before_train, data=data)
classifier = Classifier(data)
monitor = Monitor(streamer=streamer, classifier=classifier,
                  annotator=annotator, data=data)
trainer = Trainer(data=data, streamer=streamer,
                  clf=SGDClassifier(loss='log', penalty='l1', alpha=0.001))

threads = [streamer, text_processor, monitor, classifier, trainer, annotator]

socketio.run(app, debug=False)
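# A sketch under an assumption (not part of the original snippet): if these
# workers are threading.Thread subclasses, they would typically be started
# before the blocking socketio.run(app, ...) call, roughly like this:
#
#     for worker in threads:
#         worker.daemon = True
#         worker.start()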