Example #1
    # loop over each target tweet
    for i, d in enumerate(D):

        logging.debug('\t- Processing tweet {}/{}'.format(i + 1, D.count()))

        # check if we have a true label for the target tweet, if so, skip prediction and use true label
        if d['tweet_id'] in true_labels:
            # labels are stored fully written out (e.g. 'positive'); convert to the coded version (e.g. 2 for 'positive')
            d['label'] = get_sentiment_code(true_labels[d['tweet_id']])
        else:
            # infer sentiment label from classifier
            d['label'] = int(clf.predict([d['text']])[0][0])

        # update the document in the database
        db.update_collection(collection='target_tweets', doc=d)

        # add to label array
        labels[i] = (d['tweet_id'], d['label'],
                     get_tweet_type_code(d['tweet_type']))

    # location for the labels array
    labels_location = os.path.join('files', 'labels')
    # make sure directory exists
    create_directory(labels_location)
    # save labels array
    joblib.dump(labels, os.path.join(labels_location, 'labels.pkl'))
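
The labels saved above can later be reloaded with joblib. A minimal sketch, assuming the same 'files/labels/labels.pkl' location used in the snippet:

    import os
    import joblib

    # load the (tweet_id, label, tweet_type_code) entries written by the snippet above
    labels = joblib.load(os.path.join('files', 'labels', 'labels.pkl'))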
Example #2
class Preprocessing():
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf8 encoding
        reload(sys)
        sys.setdefaultencoding('utf8')

    def full_text_preprocessing(self, pdf_folder=os.path.join('files', 'pdf')):
        """
			preprocess full-text publications
			- convert pdf to plain text
			- correct for carriage returns
			- correct for end-of-line hyphenation
			- remove boilerplate
			- remove bibliography
			- remove acknowledgements

			Parameters
			----------
			pdf_folder : os.path
				location where PDF documents are stored
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read pdf files that need to be converted
        F = [x for x in read_directory(pdf_folder) if x[-4:] == '.pdf']

        # read documents from DB that have already been processed so we can skip them
        processed_documents = [
            '{}-{}-{}'.format(x['journal'], x['year'], x['title'])
            for x in self.db.read_collection(collection='publications_raw')
        ]

        # loop over each file, convert the PDF to plain text, and save metadata to the DB
        for i, f in enumerate(F):

            # extract metadata from the folder structure and file name
            journal = f.split('/')[2]
            year = f.split('/')[3]
            title = f.split('/')[4].replace('-', ' ')[4:-4].strip()

            # console output
            print_doc_verbose(i, len(F), journal, year, title)

            # check if PDF has already been processed
            if '{}-{}-{}'.format(journal, year, title) in processed_documents:
                logging.info('PDF document already processed, skipping ...')
                continue

            # convert content of PDF to plain text
            content = pdf_to_plain(f)

            # check if content could be extracted
            if content is not None:

                # fix soft hyphen
                content = content.replace(u'\xad', "-")
                # fix em-dash
                content = content.replace(u'\u2014', "-")
                # fix en-dash
                content = content.replace(u'\u2013', "-")
                # minus sign
                content = content.replace(u'\u2212', "-")
                # fix hyphenation that occurs just before a new line
                content = content.replace('-\n', '')
                # remove new lines/carriage returns
                content = content.replace('\n', ' ')

                # correct for ligatures
                content = content.replace(u'\ufb02', "fl")  # fl ligature
                content = content.replace(u'\ufb01', "fi")  # fi ligature
                content = content.replace(u'\ufb00', "ff")  # ff ligature
                content = content.replace(u'\ufb03', "ffi")  # ffi ligature
                content = content.replace(u'\ufb04', "ffl")  # ffl ligature
                """ 
					Remove boilerplate content:

					Especially journal publications have lots of boilerplate content on the titlepage. Removing of this is specific for each
					journal and you can use some regular expressions to identify and remove it.
				"""
                """
					Remove acknowledgemends and/or references
					This is a somewhat crude example
				"""
                if content.rfind("References") > 0:
                    content = content[:content.rfind("References")]
                """
				 	Remove acknowledgements
				"""
                if content.rfind("Acknowledgment") > 0:
                    content = content[:content.rfind("Acknowledgment")]

                # prepare dictionary to save into MongoDB
                doc = {
                    'journal': journal,
                    'title': title,
                    'year': year,
                    'content': content
                }

                # save to database
                self.db.insert_one_to_collection(doc=doc,
                                                 collection='publications_raw')

    def general_preprocessing(self, min_bigram_count=5):
        """
			General preprocessing of publications (used for abstracts and full-text)

			Parameters
			----------
			min_bigram_count : int (optional)
				frequency of bigram to occur to include into list of bigrams. Thus lower frequency than min_bigram_count will not be included.
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read document collection
        D = self.db.read_collection(collection='publications_raw')

        # setup spacy natural language processing object
        nlp = setup_spacy()

        # loop through the documents and correct content
        for i, d in enumerate(D):

            # check if tokens are already present, if so, skip
            if d.get('tokens') is None:

                # print to console
                print_doc_verbose(i, D.count(), d['journal'], d['year'],
                                  d['title'])

                # get content from document and convert to spacy object
                content = nlp(d['content'])

                # tokenize, lemmatize, remove punctuation and single-character words
                unigrams = word_tokenizer(content)

                # get entities
                entities = named_entity_recognition(content)

                # get bigrams; keep only those occurring at least min_bigram_count times,
                # repeated according to their frequency so token counts reflect how often they occur
                bigrams = get_bigrams(" ".join(unigrams))
                bigrams = [['{} {}'.format(x[0], x[1])] * y
                           for x, y in Counter(bigrams).most_common()
                           if y >= min_bigram_count]
                bigrams = list(itertools.chain(*bigrams))

                d['tokens'] = unigrams + bigrams + entities

                # save dictionary to database
                self.db.update_collection(collection='publications_raw', doc=d)

            else:
                logging.debug('Document already tokenized, skipping ...')
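
A minimal usage sketch for the class above, assuming the module-level imports and helpers it references (MongoDatabase, read_directory, pdf_to_plain, print_doc_verbose, setup_spacy, word_tokenizer, named_entity_recognition, get_bigrams) are importable:

    # run both preprocessing stages in order
    preprocessing = Preprocessing()

    # convert PDFs under files/pdf to plain text and store them in the publications_raw collection
    preprocessing.full_text_preprocessing(pdf_folder=os.path.join('files', 'pdf'))

    # tokenize the stored documents and add unigrams, bigrams and named entities
    preprocessing.general_preprocessing(min_bigram_count=5)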