Example #1
    # loop over each target tweet
    for i, d in enumerate(D):

        logging.debug('\t- Processing tweet {}/{}'.format(i + 1, D.count()))

        # check if we have a true label for the target tweet, if so, skip prediction and use true label
        if d['tweet_id'] in true_labels:
            # labels are stored fully written out (e.g. 'positive'); convert to the coded version (e.g. 2 for 'positive')
            d['label'] = get_sentiment_code(true_labels[d['tweet_id']])
        else:
            # infer sentiment label from classifier
            d['label'] = int(clf.predict([d['text']])[0][0])

        # update the document in the database
        db.update_collection(collection='target_tweets', doc=d)

        # add to label array
        labels[i] = (d['tweet_id'], d['label'],
                     get_tweet_type_code(d['tweet_type']))

    # location for the labels array
    labels_location = os.path.join('files', 'labels')
    # make sure directory exists
    create_directory(labels_location)
    # save labels array
    joblib.dump(labels, os.path.join(labels_location, 'labels.pkl'))
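
The labels saved above can later be reloaded with joblib. A minimal sketch, assuming the same 'files/labels/labels.pkl' location used in the snippet:

    import os
    import joblib

    # load the (tweet_id, label, tweet_type_code) entries written by the snippet above
    labels = joblib.load(os.path.join('files', 'labels', 'labels.pkl'))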
Example #2
class Preprocessing():
    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf8 encoding
        reload(sys)
        sys.setdefaultencoding('utf8')

    def full_text_preprocessing(self, pdf_folder=os.path.join('files', 'pdf')):
        """
			preprocess full-text publications
			- convert pdf to plain text
			- correct for carriage returns
			- correct for end-of-line hyphenation
			- remove boilerplate
			- remove bibliography
			- remove acknowledgements

			Parameters
			----------
			pdf_folder : os.path
				location where PDF documents are stored
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read pdf files that need to be converted
        F = [x for x in read_directory(pdf_folder) if x[-4:] == '.pdf']

        # read documents from DB that have already been processed so we can skip them
        processed_documents = [
            '{}-{}-{}'.format(x['journal'], x['year'], x['title'])
            for x in self.db.read_collection(collection='publications_raw')
        ]

        # loop over each file, convert the PDF to plain text, and save metadata to the DB
        for i, f in enumerate(F):

            # extract metadata from the folder structure and file name
            journal = f.split('/')[2]
            year = f.split('/')[3]
            title = f.split('/')[4].replace('-', ' ')[4:-4].strip()

            # console output
            print_doc_verbose(i, len(F), journal, year, title)

            # check if PDF has already been processed
            if '{}-{}-{}'.format(journal, year, title) in processed_documents:
                logging.info('PDF document already processed, skipping ...')
                continue

            # convert content of PDF to plain text
            content = pdf_to_plain(f)

            # check if content could be extracted
            if content is not None:

                # fix soft hyphen
                content = content.replace(u'\xad', "-")
                # fix em-dash
                content = content.replace(u'\u2014', "-")
                # fix en-dash
                content = content.replace(u'\u2013', "-")
                # minus sign
                content = content.replace(u'\u2212', "-")
                # fix hyphenation that occurs just before a new line
                content = content.replace('-\n', '')
                # remove new lines/carriage returns
                content = content.replace('\n', ' ')

                # correct for ligatures
                content = content.replace(u'\ufb02', "fl")  # fl ligature
                content = content.replace(u'\ufb01', "fi")  # fi ligature
                content = content.replace(u'\ufb00', "ff")  # ff ligature
                content = content.replace(u'\ufb03', "ffi")  # ffi ligature
                content = content.replace(u'\ufb04', "ffl")  # ffl ligature
                """ 
					Remove boilerplate content:

					Especially journal publications have lots of boilerplate content on the titlepage. Removing of this is specific for each
					journal and you can use some regular expressions to identify and remove it.
				"""
                """
					Remove acknowledgemends and/or references
					This is a somewhat crude example
				"""
                if content.rfind("References") > 0:
                    content = content[:content.rfind("References")]
                """
				 	Remove acknowledgements
				"""
                if content.rfind("Acknowledgment") > 0:
                    content = content[:content.rfind("Acknowledgment")]

                # prepare dictionary to save into MongoDB
                doc = {
                    'journal': journal,
                    'title': title,
                    'year': year,
                    'content': content
                }

                # save to database
                self.db.insert_one_to_collection(doc=doc,
                                                 collection='publications_raw')

    def general_preprocessing(self, min_bigram_count=5):
        """
			General preprocessing of publications (used for abstracts and full-text)

			Parameters
			----------
			min_bigram_count : int (optional)
				frequency of bigram to occur to include into list of bigrams. Thus lower frequency than min_bigram_count will not be included.
		"""

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read document collection
        D = self.db.read_collection(collection='publications_raw')

        # setup spacy natural language processing object
        nlp = setup_spacy()

        # loop through the documents and correct content
        for i, d in enumerate(D):

            # check if tokens are already present, if so, skip
            if d.get('tokens') is None:

                # print to console
                print_doc_verbose(i, D.count(), d['journal'], d['year'],
                                  d['title'])

                # get content from document and convert to spacy object
                content = nlp(d['content'])

                # tokenize, lemmatize, remove punctuation and single-character words
                unigrams = word_tokenizer(content)

                # get entities
                entities = named_entity_recognition(content)

                # get bigrams; keep only those occurring at least min_bigram_count times,
                # repeated according to their frequency so token counts reflect how often they occur
                bigrams = get_bigrams(" ".join(unigrams))
                bigrams = [['{} {}'.format(x[0], x[1])] * y
                           for x, y in Counter(bigrams).most_common()
                           if y >= min_bigram_count]
                bigrams = list(itertools.chain(*bigrams))

                d['tokens'] = unigrams + bigrams + entities

                # save dictionary to database
                self.db.update_collection(collection='publications_raw', doc=d)

            else:
                logging.debug('Document already tokenized, skipping ...')
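
A minimal usage sketch for the class above, assuming the module-level imports and helpers it references (MongoDatabase, read_directory, pdf_to_plain, print_doc_verbose, setup_spacy, word_tokenizer, named_entity_recognition, get_bigrams) are importable:

    # run both preprocessing stages in order
    preprocessing = Preprocessing()

    # convert PDFs under files/pdf to plain text and store them in the publications_raw collection
    preprocessing.full_text_preprocessing(pdf_folder=os.path.join('files', 'pdf'))

    # tokenize the stored documents and add unigrams, bigrams and named entities
    preprocessing.general_preprocessing(min_bigram_count=5)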