Example #1
def test_empty_description(self):
    self.assertRaises(ValidationError,
                      main_models.Document(description='').clean_fields)
    self.assertRaises(ValidationError,
                      main_models.Document(description=' ').clean_fields)
    self.assertRaises(ValidationError,
                      main_models.Document(description=' .').clean_fields)
    self.assertRaises(
        ValidationError,
        main_models.Document(description='<div> .</div>').clean_fields)
    self.assertRaises(
        ValidationError,
        main_models.Document(description='&emdash;').clean_fields)
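All five descriptions reduce to nothing once markup, entities, and stray punctuation are stripped. A sketch of the kind of validator this test exercises (hypothetical; the real main_models.Document logic is not shown here):

import re

from django.core.exceptions import ValidationError

def validate_description(value):
    # Drop tags and HTML entities, then whitespace and bare punctuation;
    # whatever remains must be visible text.
    text = re.sub(r'<[^>]+>|&[a-z]+;', '', value)
    if not text.strip(' .'):
        raise ValidationError('description must contain visible text')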
Example #2
def process(docs, session):
    for doc_object in docs:
        json_object = doc_object['columns']
        doc_id = json_object['itemid']
        # todo: pass this to logger
        print(doc_id)

        html = retrieve_html(doc_id)

        case_name = process_case_name(json_object['docname'])

        doc = models.Document(
            id=doc_id,
            scl=json_object['scl'],
            html=html,
            case=json_object['appno'],
            date=parse_date(json_object['kpdate']),
            case_name=case_name,
            tags=json_object['documentcollectionid2'],
            violations=json_object['violation'],
            nonviolations=json_object['nonviolation'],
        )

        for article_id in parse_articles(json_object):
            article = get_or_create(session, models.Article, id=article_id)
            doc.articles.append(article)

        # merge: if doc already exists in db, update it.
        session.merge(doc)
        session.commit()
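get_or_create is not defined in this snippet; a common SQLAlchemy helper by that name looks roughly like this (an assumption, not necessarily this project's version):

def get_or_create(session, model, **kwargs):
    # Return an existing row matching kwargs, or stage a new one.
    instance = session.query(model).filter_by(**kwargs).first()
    if instance is None:
        instance = model(**kwargs)
        session.add(instance)
    return instance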
Example #3
def process(documents, config, sent_type=constants.SENTENCE_TYPE_GENERAL):
    """
    :param dict documents: format(id => content)
    :param int sent_type: type of sentence
    :param dict config: keys are 'segmenter', 'tokenizer', 'options'
    :return: list of Document
    """
    assert config.__class__.__name__ == 'dict', '"config" must be a dict.'
    doc_objects = []
    segmenter = config.get(opt.SEGMENTER_KEY, opt.SpacySegmenter())
    tokenizer = config.get(opt.TOKENIZER_KEY, opt.SpacyTokenizer())
    for doc_id, content in documents.items():
        doc_obj = models.Document(id=doc_id, content=content)
        doc_obj.sentences = []
        raw_sentences = segmenter.segment(doc_obj.content)
        current_pos = 0
        for s in raw_sentences:
            # locate each sentence in the original text to record its offsets
            start_offset = content.find(s, current_pos)
            end_offset = start_offset + len(s)
            sent_obj = __parse_sentence(s, (start_offset, end_offset), tokenizer)
            sent_obj.type = sent_type
            current_pos = end_offset
            doc_obj.sentences.append(sent_obj)

        doc_objects.append(doc_obj)

    optional = config.get(opt.OPTION_KEY, [])
    for o in optional:
        for doc_obj in doc_objects:
            o.process(doc_obj)

    return doc_objects
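A hypothetical call, assuming an empty config is acceptable so the spaCy-backed defaults kick in (sketch only; constants and opt come from this project's own modules):

documents = {'doc-1': 'First sentence. Second sentence.'}
parsed = process(documents, config={})   # falls back to SpacySegmenter/SpacyTokenizer
print(len(parsed[0].sentences))          # sentences found by the segmenter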
Example #4
File: set_db.py Project: mquezada/ams
def add_documents(name: str, event_ids: List[int],
                  tweet_urls: Dict[int, models.URL], session):
    uf = UnionFind()

    tweets = session.query(models.Tweet).filter(
        models.Tweet.event_id_id.in_(event_ids)).all()
    for tweet in tqdm(tweets, desc="Iterating over tweets (create sets)"):
        uf.make_set(tweet.tweet_id)

        url_obj = tweet_urls.get(tweet.tweet_id)
        if url_obj:
            uf.make_set(url_obj.expanded_url)

    for tweet in tqdm(tweets, desc="Iterating over tweets (join sets)"):
        if tweet.in_reply_to_status_id:
            uf.union(tweet.tweet_id, int(tweet.in_reply_to_status_id))
        if tweet.retweet_of_id:
            uf.union(tweet.tweet_id, int(tweet.retweet_of_id))

        url_obj = tweet_urls.get(tweet.tweet_id)
        if url_obj:
            uf.union(tweet.tweet_id, url_obj.expanded_url)

    with session.begin():
        group_doc = dict()
        # de-duplicate representatives so each group gets exactly one Document
        groups = {str(uf.find(g)) for g in uf.groups}
        for rep in groups:
            document = models.Document(url=rep)
            group_doc[rep] = document

        for tweet in tqdm(tweets,
                          desc="Iterating over tweets (set documents)"):
            rep = str(uf.find(tweet.tweet_id))
            doc = group_doc[rep]

            tweet.document = doc

    return uf
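UnionFind is imported from elsewhere; a minimal class with the interface used above (make_set, union, find, groups) might look like this sketch, not the project's actual implementation:

class UnionFind:
    def __init__(self):
        self.parent = {}

    @property
    def groups(self):
        # every element ever added; callers map find() over these
        return list(self.parent)

    def make_set(self, x):
        self.parent.setdefault(x, x)

    def find(self, x):
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]  # path halving
            x = self.parent[x]
        return x

    def union(self, a, b):
        self.make_set(a)
        self.make_set(b)
        self.parent[self.find(a)] = self.find(b)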
Example #5
def get_features(args, mode):
    fname = os.path.join('../cache',
                         'cached_%s_%s.pkl' % (args.bert_type, mode))
    features = models.Features(args.bert_type)

    if os.path.exists(fname):
        logger.info('Loading features from cached file %s', fname)
        features.load_from_cache(fname)
    else:
        logger.info('Building cached feature file %s', fname)

        dlist = get_docnames(args.docs)
        docs = [models.Document(x) for x in dlist]

        # simple 80/20 split: first 80% of documents for training,
        # the remainder for evaluation
        if mode == 'train':
            docs = docs[:int(len(docs) * 0.8)]
        else:
            docs = docs[int(len(docs) * 0.8):]

        features.load_from_docs(docs)
        features.save_to_cache(fname)

    return features
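Hypothetical usage, assuming args carries bert_type and docs; any mode other than 'train' selects the held-out 20%:

train_features = get_features(args, 'train')  # first 80% of documents
eval_features = get_features(args, 'eval')    # remaining 20%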
Example #6
def test_duplicate_descriptions(self):
    data = self.document_kwargs.copy()
    data['description'] = u'“My Disillusionment in Russia”'
    test_document = main_models.Document(**data)
    self.assertRaises(ValidationError, test_document.full_clean)
    self.assertRaises(IntegrityError, test_document.save)
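full_clean() runs Django's Python-level validators (here, presumably a uniqueness check on description) and so raises ValidationError, while save() reaches the database and trips the unique constraint itself, raising IntegrityError. The usual application pattern is to validate first so the constraint is only a backstop (sketch; error handling is up to the caller):

doc = main_models.Document(**data)
doc.full_clean()  # raises ValidationError on duplicates
doc.save()        # would raise IntegrityError only if a race slipped past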