def test_empty_description(self):
    self.assertRaises(ValidationError, main_models.Document(description='').clean_fields)
    self.assertRaises(ValidationError, main_models.Document(description=' ').clean_fields)
    self.assertRaises(ValidationError, main_models.Document(description=' .').clean_fields)
    self.assertRaises(
        ValidationError,
        main_models.Document(description='<div> .</div>').clean_fields)
    self.assertRaises(
        ValidationError,
        main_models.Document(description='&emdash;').clean_fields)
def process(docs, session):
    for doc_object in docs:
        json_object = doc_object['columns']
        doc_id = json_object['itemid']
        # todo: pass this to logger
        print(doc_id)
        html = retrieve_html(doc_id)
        case_name = process_case_name(json_object['docname'])
        doc = models.Document(
            id=doc_id,
            scl=json_object['scl'],
            html=html,
            case=json_object['appno'],
            date=parse_date(json_object['kpdate']),
            case_name=case_name,
            tags=json_object['documentcollectionid2'],
            violations=json_object['violation'],
            nonviolations=json_object['nonviolation'],
        )
        for article_id in parse_articles(json_object):
            article = get_or_create(session, models.Article, id=article_id)
            doc.articles.append(article)
        # merge: if doc already exists in db, update it.
        session.merge(doc)
    session.commit()
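# The loader above calls a get_or_create helper that is not shown in this
# snippet. Below is a minimal sketch of the common SQLAlchemy idiom for it,
# assuming lookup by keyword filters; the project's real helper may differ.
def get_or_create(session, model, **kwargs):
    # Return an existing row matching the filters, or stage a new one.
    instance = session.query(model).filter_by(**kwargs).first()
    if instance is None:
        instance = model(**kwargs)
        session.add(instance)
    return instance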
def process(documents, config, sent_type=constants.SENTENCE_TYPE_GENERAL):
    """
    :param dict documents: maps id => content
    :param dict config: keys are 'segmenter', 'tokenizer', 'options'
    :param int sent_type: type of sentence
    :return: list of Document
    """
    assert isinstance(config, dict), '"config" must be a dict.'
    doc_objects = []
    segmenter = config.get(opt.SEGMENTER_KEY, opt.SpacySegmenter())
    tokenizer = config.get(opt.TOKENIZER_KEY, opt.SpacyTokenizer())
    for i in documents:
        doc_obj = models.Document(id=i, content=documents[i])
        doc_obj.sentences = []
        raw_sentences = segmenter.segment(doc_obj.content)
        current_pos = 0
        for s in raw_sentences:
            # Search from current_pos so character offsets stay correct
            # even when the same sentence occurs twice in a document.
            start_offset = documents[i].find(s, current_pos)
            end_offset = start_offset + len(s)
            sent_obj = __parse_sentence(s, (start_offset, end_offset), tokenizer)
            sent_obj.type = sent_type
            current_pos = end_offset
            doc_obj.sentences.append(sent_obj)
        doc_objects.append(doc_obj)
    optional = config.get(opt.OPTION_KEY, [])
    for o in optional:
        for doc_obj in doc_objects:
            o.process(doc_obj)
    return doc_objects
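# A minimal usage sketch for process() above, assuming the default
# spaCy-backed components; the document id and text are illustrative:
documents = {'doc-1': 'First sentence here. A second sentence follows.'}
config = {
    opt.SEGMENTER_KEY: opt.SpacySegmenter(),
    opt.TOKENIZER_KEY: opt.SpacyTokenizer(),
    opt.OPTION_KEY: [],  # optional post-processors, each with a .process(doc)
}
doc_objects = process(documents, config)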
def add_documents(name: str, event_ids: List[int],
                  tweet_urls: Dict[int, models.URL], session):
    uf = UnionFind()
    tweets = session.query(models.Tweet).filter(
        models.Tweet.event_id_id.in_(event_ids)).all()
    for tweet in tqdm(tweets, desc="Iterating over tweets (create sets)"):
        uf.make_set(tweet.tweet_id)
        url_obj = tweet_urls.get(tweet.tweet_id)
        if url_obj:
            uf.make_set(url_obj.expanded_url)
    for tweet in tqdm(tweets, desc="Iterating over tweets (join sets)"):
        if tweet.in_reply_to_status_id:
            uf.union(tweet.tweet_id, int(tweet.in_reply_to_status_id))
        if tweet.retweet_of_id:
            uf.union(tweet.tweet_id, int(tweet.retweet_of_id))
        url_obj = tweet_urls.get(tweet.tweet_id)
        if url_obj:
            uf.union(tweet.tweet_id, url_obj.expanded_url)
    with session.begin():
        group_doc = dict()
        groups = map(lambda g: str(uf.find(g)), uf.groups)
        for rep in groups:
            document = models.Document(url=rep)
            group_doc[rep] = document
        for tweet in tqdm(tweets, desc="Iterating over tweets (set documents)"):
            rep = str(uf.find(tweet.tweet_id))
            doc = group_doc[rep]
            tweet.document = doc
    return uf
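# add_documents() above assumes a UnionFind exposing make_set/union/find and
# a `groups` collection of set representatives. Below is a minimal
# path-compressing sketch of such a structure; the project's actual class
# may differ in API details.
class UnionFind:
    def __init__(self):
        self.parent = {}

    def make_set(self, x):
        # Register x as its own singleton set if unseen.
        self.parent.setdefault(x, x)

    def find(self, x):
        # Walk up to the root, compressing the path behind us.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, a, b):
        # make_set first: replied-to ids may fall outside the queried tweets.
        self.make_set(a)
        self.make_set(b)
        self.parent[self.find(a)] = self.find(b)

    @property
    def groups(self):
        # One representative per disjoint set.
        return {self.find(x) for x in self.parent}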
def get_features(args, mode):
    fname = os.path.join('../cache', 'cached_%s_%s.pkl' % (args.bert_type, mode))
    features = models.Features(args.bert_type)
    if os.path.exists(fname):
        logger.info('Loading features from cached file %s' % fname)
        features.load_from_cache(fname)
    else:
        logger.info('Building cached feature file %s' % fname)
        dlist = get_docnames(args.docs)
        docs = [models.Document(x) for x in dlist]
        # 80/20 split: the first 80% of documents train, the rest evaluate.
        if mode == 'train':
            docs = docs[:int(len(docs) * 0.8)]
        else:
            docs = docs[int(len(docs) * 0.8):]
        features.load_from_docs(docs)
        features.save_to_cache(fname)
    return features
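# Hedged invocation sketch: get_features() only reads args.bert_type and
# args.docs, so an argparse namespace like this would drive it. The flag
# names and defaults here are assumptions, not the project's real CLI.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bert_type', default='bert-base-uncased')
parser.add_argument('--docs', default='../data/docs')
args = parser.parse_args()

train_features = get_features(args, 'train')
eval_features = get_features(args, 'eval')  # any non-'train' mode gets the tail 20%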
def test_duplicate_descriptions(self):
    data = self.document_kwargs.copy()
    data['description'] = u'“My Disillusionment in Russia”'
    test_document = main_models.Document(**data)
    self.assertRaises(ValidationError, test_document.full_clean)
    self.assertRaises(IntegrityError, test_document.save)