def calculate_text_te(n_samples_list=None, normalised=True):
    n_users, n_dims = 12, 50
    if not n_samples_list:
        n_samples_list = get_n_samples_list()

    # n_samples_list = n_samples_list[2:8]
    print(n_samples_list)
    for n_samples in n_samples_list:
        data_info = {
            'n_users': n_users,
            'n_samples': n_samples,
            'n_dims': n_dims
        }

        # use the dict of data info defined above.
        tp = TextProcessor(data_info['n_users'], data_info['n_samples'],
                           data_info['n_dims'])

        data = tp.load_vec('lda')
        print(data.shape)
        cn, te_mat = calculate_te(data, 'lda', normalised=normalised)
        print('te_lda_%d_%d_%d' %
              (data_info['n_users'], data_info['n_samples'],
               data_info['n_dims']))
        print(cn)
        print(te_mat)
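
# A minimal follow-up sketch, assuming calculate_te returns a NumPy array:
# persist te_mat under the same name the loop above prints. np.save and the
# helper name save_te_matrix are illustrative, not part of the original API.
import numpy as np

def save_te_matrix(te_mat, data_info, out_dir='.'):
    # Build the same identifier printed in calculate_text_te, then save.
    name = 'te_lda_%d_%d_%d' % (data_info['n_users'],
                                data_info['n_samples'],
                                data_info['n_dims'])
    np.save('%s/%s.npy' % (out_dir, name), te_mat)
    return name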
Example #2
def parse_documents():
    arxiv_parser = ArxivParser(TextProcessor())
    springer_parser = SpringerParser(TextProcessor())

    for document in Document.select(lambda doc: not doc.is_processed):
        if "arxiv.org" in urlparse(document.url)[1]:
            cur_parser = arxiv_parser
        elif "springer.com" in urlparse(document.url)[1]:
            cur_parser = springer_parser
        else:
            continue

        page = WebPage.from_disk(document.url, document.file_path)

        if document.document_hash != page.page_hash:
            Document[document.id].delete()
            continue

        parsed = cur_parser.parse(page)
        document.is_processed = True
        commit()

        logging.debug(("Article: {}" if parsed else "{}").format(document.url))
Example #3
def setup_ranker():
    global ranker

    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(
            AbstractAndArticle(article,
                               _read_file(article.processed_abstract_path)))

    ranker = TfIdf(index, text_processor, docs, VECTORS_PER_FILE,
                   VECTORS_SAVE_FOLDER)
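
# _read_file is used above but not shown; a minimal sketch, assuming it simply
# returns the processed abstract's text (the utf-8 encoding is an assumption):
def _read_file(path):
    with open(path, encoding='utf-8') as f:
        return f.read()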
Example #4
def run_rank():
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(AbstractAndArticle(article, _read_file(article.processed_abstract_path)))

    ranker = TfIdf(index,
                   text_processor,
                   docs,
                   vectors_per_file=VECTORS_PER_FILE,
                   vectors_save_folder=VECTORS_SAVE_FOLDER)

    while True:
        query = input("Enter query: ")
        top_ids = ranker.rank(query, 5)
        for article_id in top_ids:
            article = Article[article_id]
            print(article.title, article.document.url)
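
# A small variation of the query loop above that stops on an empty query; the
# exit condition is an addition, not part of the original example:
while True:
    query = input("Enter query: ").strip()
    if not query:
        break
    for article_id in ranker.rank(query, 5):
        article = Article[article_id]
        print(article.title, article.document.url)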
Example #5
    logging.info('Starting Application...')

    #for key in logging.Logger.manager.loggerDict:
    #    logging.getLogger(key).setLevel(logging.CRITICAL)

    # Initialize Threads
    streamer = Streamer(name='Streamer',
                        keywords=seed_keywords,
                        credentials=credentials['coll_1'],
                        tp_queue=text_processor_queue,
                        filter_params=filters,
                        kw_queue=keyword_queue,
                        limit_queue=lim_queue,
                        message_queue=mess_queue)
    text_processor = TextProcessor(name='Text Processor',
                                   database=db,
                                   tp_queue=text_processor_queue,
                                   dictionary=d)
    annotator = Annotator(name='Annotator',
                          database=db,
                          train_event=te,
                          annotation_response=annot_resp,
                          socket=socketio,
                          train_threshold=n_before_train,
                          message_queue=mess_queue)
    classifier = Classifier(name='Classifier',
                            database=db,
                            model=model_queue,
                            dictionary=d)
    monitor = Monitor(name='Monitor',
                      database=db,
                      socket=socketio,
Example #6
import argparse


def process_args():
    # Build the CLI parser returned below (the ArgumentParser construction here
    # is assumed; only the arguments that follow are from the original).
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file',
                        type=str,
                        required=True,
                        help='raw training corpus path')
    parser.add_argument('--seg_train_file',
                        type=str,
                        required=True,
                        help='segmented training data path')
    return parser.parse_args()


def seg_file(tp, train_file, seg_train_file):
    # Segment the title and content columns, copying cmsid and label through.
    with open(train_file) as f, open(seg_train_file, 'w') as fw:
        for line in f:
            cmsid, label, title, content = line.strip('\n').split('\t')
            fw.write(cmsid + '\t' + label + '\t' +
                     ' '.join(tp.seg_text(title)[0]) + '\t' +
                     ' '.join(tp.seg_text(content)[0]) + '\n')


if __name__ == '__main__':
    args = process_args()
    print "train_file:" + args.train_file
    print "train_file_seg:" + args.seg_train_file
    tp = TextProcessor("/data/ainlp/classification/data/new_first_data/")
    seg_file(tp, args.train_file, args.seg_train_file)
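
# A hedged command-line usage sketch for the script above; the script name is
# a placeholder, only the --train_file/--seg_train_file flags come from
# process_args():
#   python seg_train.py --train_file raw_train.tsv --seg_train_file seg_train.tsv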

Example #7
def segment_file(tp, raw_corpus, seg_corpus, title_col, cont_col):
    # Segment the raw training file column by column and save the result. The
    # signature and outer loop are assumed from the call in __main__ below; the
    # per-column handling is from the original snippet.
    fout = open(seg_corpus, 'w')
    with open(raw_corpus) as fin:
        for line in fin:
            fields = line.strip('\n').split('\t')
            out_fields = []
            for i in range(len(fields)):
                if i == title_col:
                    title = fields[i]
                    seg_title = ' '.join(tp.seg_text(title)[0])
                    out_fields.append(seg_title)
                elif i == cont_col:
                    cont = fields[i]
                    cont = tp.replace_punc(cont, '。')
                    cont = tp.filter_affix(cont)
                    cont = tp.truncate(cont)
                    cont = cont.encode('utf8')
                    seg_cont = ' '.join(tp.seg_text(cont)[0])
                    out_fields.append(seg_cont)
                else:
                    out_fields.append(fields[i])

            fout.write('%s\n' % '\t'.join(out_fields))
    fout.close()


if __name__ == '__main__':
    args = process_args()

    tp = TextProcessor('/data/ainlp/classification/data/')

    logging.info('Start segmenting corpus ...')
    logging.info('raw_corpus: %s' % args.raw_corpus)
    logging.info('seg_corpus: %s' % args.seg_corpus)

    segment_file(tp, args.raw_corpus, args.seg_corpus, args.title_col,
                 args.cont_col)

    logging.info('Finished')
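
# A hedged command-line usage sketch; the script name and column indices are
# placeholders, only the flag names mirror the args.* attributes used above:
#   python seg_corpus.py --raw_corpus raw.tsv --seg_corpus seg.tsv --title_col 2 --cont_col 3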
Example #8
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s (%(threadName)s) %(message)s',
                        filename='debug.log')

    logging.info('\n' * 5)
    logging.info('*' * 10 + 'ACTIVE STREAM' + '*' * 10)
    logging.info('Starting Application...')

    logging.getLogger('socketio').setLevel(logging.ERROR)
    logging.getLogger('werkzeug').setLevel(logging.ERROR)

    # Initialize Threads
    streamer = Streamer(credentials_track=credentials['coll_1'],
                        credentials_sample=credentials['main_account'],
                        data=data)
    text_processor = TextProcessor(data)
    annotator = Annotator(train_threshold=n_before_train, data=data)
    classifier = Classifier(data)
    monitor = Monitor(streamer=streamer,
                      classifier=classifier,
                      annotator=annotator,
                      data=data)
    trainer = Trainer(data=data,
                      streamer=streamer,
                      clf=SGDClassifier(loss='log', penalty='l1', alpha=0.001))

    threads = [
        streamer, text_processor, monitor, classifier, trainer, annotator
    ]

    socketio.run(app, debug=False)
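
# The threads list above is built but never started in this snippet; a minimal
# sketch of starting the workers (in the full application this would normally
# happen before the blocking socketio.run call), assuming each entry is a
# threading.Thread subclass, as the thread-style constructors above suggest:
for thread in threads:
    thread.start()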