# loop over each target tweet
for i, d in enumerate(D):

    logging.debug(' - Processing tweet {}/{}'.format(i + 1, D.count()))

    # check if we have a true label for the target tweet; if so, skip prediction and use the true label
    if d['tweet_id'] in true_labels:
        # the labels are stored fully written out, for example 'positive', and we need the coded version, that is 2 for positive
        d['label'] = get_sentiment_code(true_labels[d['tweet_id']])
    else:
        # infer sentiment label from the classifier
        d['label'] = int(clf.predict([d['text']])[0][0])

    # update the document in the database
    db.update_collection(collection='target_tweets', doc=d)

    # add to label array
    labels[i] = (d['tweet_id'], d['label'], get_tweet_type_code(d['tweet_type']))

# location for the labels array
labels_location = os.path.join('files', 'labels')

# make sure the directory exists
create_directory(labels_location)

# save labels array
joblib.dump(labels, os.path.join(labels_location, 'labels.pkl'))
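The helpers get_sentiment_code and get_tweet_type_code are called above but not shown; a minimal hypothetical sketch of what they might look like follows. Only the 'positive' -> 2 mapping is stated in the comment above; the remaining codes and tweet-type categories are assumptions.

# hypothetical sketch of the label-coding helpers used above;
# only 'positive' -> 2 comes from the comment in the loop, all other mappings are assumptions
def get_sentiment_code(label):
    # map a fully written sentiment label to its numeric code
    return {'negative': 0, 'neutral': 1, 'positive': 2}[label]

def get_tweet_type_code(tweet_type):
    # map a tweet type to a numeric code; the category names here are assumptions
    return {'tweet': 0, 'retweet': 1, 'reply': 2}[tweet_type]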
class Preprocessing():

    def __init__(self):

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf-8 encoding (Python 2)
        reload(sys)
        sys.setdefaultencoding('utf8')

    def full_text_preprocessing(self, pdf_folder=os.path.join('files', 'pdf')):
        """
        Preprocess full-text publications:
        - convert PDF to plain text
        - correct for carriage returns
        - correct for end-of-line hyphenation
        - remove boilerplate
        - remove bibliography
        - remove acknowledgements

        Parameters
        ----------
        pdf_folder : os.path
            location where the PDF documents are stored
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read PDF files that need to be converted
        F = [x for x in read_directory(pdf_folder) if x[-4:] == '.pdf']

        # read documents from the DB that have already been processed so we can skip them
        processed_documents = ['{}-{}-{}'.format(x['journal'], x['year'], x['title'])
                               for x in self.db.read_collection(collection='publications_raw')]

        # loop over each file, convert the PDF to plain text, and save content plus metadata to the DB
        for i, f in enumerate(F):

            # extract metadata from the folder structure and file name
            journal = f.split('/')[2]
            year = f.split('/')[3]
            title = f.split('/')[4].replace('-', ' ')[4:-4].strip()

            # console output
            print_doc_verbose(i, len(F), journal, year, title)

            # check if the PDF has already been processed
            if '{}-{}-{}'.format(journal, year, title) in processed_documents:
                logging.info('PDF document already processed, skipping ...')
                continue

            # convert content of the PDF to plain text
            content = pdf_to_plain(f)

            # check if content could be extracted
            if content is not None:

                # fix soft hyphen
                content = content.replace(u'\xad', "-")
                # fix em-dash
                content = content.replace(u'\u2014', "-")
                # fix en-dash
                content = content.replace(u'\u2013', "-")
                # fix minus sign
                content = content.replace(u'\u2212', "-")
                # fix hyphenation that occurs just before a new line
                content = content.replace('-\n', '')
                # remove new lines/carriage returns
                content = content.replace('\n', ' ')
                # correct for ligatures
                content = content.replace(u'\ufb02', "fl")   # fl ligature
                content = content.replace(u'\ufb01', "fi")   # fi ligature
                content = content.replace(u'\ufb00', "ff")   # ff ligature
                content = content.replace(u'\ufb03', "ffi")  # ffi ligature
                content = content.replace(u'\ufb04', "ffl")  # ffl ligature

                """
                Remove boilerplate content: journal publications in particular have a lot of
                boilerplate on the title page. Removing it is specific to each journal, and
                regular expressions can be used to identify and strip it.
                """

                """
                Remove acknowledgements and/or references.
                This is a somewhat crude example.
                """
                if content.rfind("References") > 0:
                    content = content[:content.rfind("References")]

                """
                Remove acknowledgements
                """
                if content.rfind("Acknowledgment") > 0:
                    content = content[:content.rfind("Acknowledgment")]

                # prepare dictionary to save into MongoDB
                doc = {'journal': journal, 'title': title, 'year': year, 'content': content}

                # save to database
                self.db.insert_one_to_collection(doc=doc, collection='publications_raw')

    def general_preprocessing(self, min_bigram_count=5):
        """
        General preprocessing of publications (used for abstracts and full text)

        Parameters
        ----------
        min_bigram_count : int (optional)
            minimum number of times a bigram must occur to be included in the list of bigrams;
            bigrams occurring less often than min_bigram_count are excluded.
""" logging.info('Start {}'.format(sys._getframe().f_code.co_name)) # read document collection D = self.db.read_collection(collection='publications_raw') # setup spacy natural language processing object nlp = setup_spacy() # loop through the documents and correct content for i, d in enumerate(D): # check if tokens are already present, if so, skip if d.get('tokens') is None: # print to console print_doc_verbose(i, D.count(), d['journal'], d['year'], d['title']) # get content from document and convert to spacy object content = nlp(d['content']) # tokenize, lemmatization, remove punctuation, remove single character words unigrams = word_tokenizer(content) # get entities entities = named_entity_recognition(content) # get bigrams bigrams = get_bigrams(" ".join(unigrams)) bigrams = [['{} {}'.format(x[0], x[1])] * y for x, y in Counter(bigrams).most_common() if y >= min_bigram_count] bigrams = list(itertools.chain(*bigrams)) d['tokens'] = unigrams + bigrams + entities # save dictionary to datbase self.db.update_collection(collection='publications_raw', doc=d) else: logging.debug('Document already tokenized, skipping ...')