def build_lemma_vocab(append_string=''):
    """
    Builds the entire vocabulary (lemma -> number of occurrences) from the
    pickled dataframe of reviews and saves it to file as a pandas Series.

    append_string = optional suffix placed before '.pkl' in both the input
                    file ('../data/pandas/review<append_string>.pkl') and the
                    output file ('../data/pandas/vocab<append_string>.pkl')

    Returns True once the vocabulary has been written.
    """
    import process_text

    # Restore data from file
    reviews = pd.read_pickle('../data/pandas/review' + append_string + '.pkl')
    print(len(reviews))

    vocab = {}
    for count, text in enumerate(reviews['text'], 1):
        # NOTE(review): assumes text2lemmas returns an iterable of hashable
        # lemma tokens -- confirm against process_text.
        lemmas = process_text.text2lemmas(text)
        # Tally this review's lemmas. The iteration must run over `lemmas`,
        # not over `vocab` (which starts out empty and would never grow).
        for lemma in lemmas:
            vocab[lemma] = vocab.get(lemma, 0) + 1
        # Progress report every 100 reviews
        if count % 100 == 0:
            print(count)

    # Save vocabulary to file
    vocab = pd.Series(vocab)
    vocab.to_pickle('../data/pandas/vocab' + append_string + '.pkl')
    return True
def add_lemmas2pandas(type_string='', append_string=''):
    """
    Adds lemmatized text as an extra 'lemmas' column in the reviews or
    sentences dataframe and saves the result to a new pickle file.

    type_string = {'review'|'sentences'}
                  determines whether to lemmatize text in pandas dataframe
                  of reviews or sentences
    append_string = optional suffix placed before '.pkl' in both the input
                    file ('../data/pandas/<type_string><append_string>.pkl')
                    and the output file
                    ('../data/pandas/<type_string>_lemmas<append_string>.pkl')

    Returns True on success, or None (after printing an error message) when
    type_string is not one of the accepted values.
    """
    # Error handling -- done before the import and the file load so that a
    # bad argument is reported immediately.
    if type_string not in ['review', 'sentences']:
        print("Error in add_lemmas2pandas:")
        print("   type_string must be either 'review' or 'sentences'")
        print("   please try again")
        return None

    import process_text

    # Lemmatize each row in the dataframe
    dataframe = pd.read_pickle('../data/pandas/' + type_string + append_string + '.pkl')
    lemmatized_text = []
    # Iterate over the 'text' column positionally rather than looking each
    # row up by label: label lookups are slow and return several rows at once
    # when the index contains duplicate labels.
    for count, text in enumerate(dataframe['text']):
        # Progress report every 1000 rows (including row 0)
        if count % 1000 == 0:
            print(count)
        lemmatized_text.append(process_text.text2lemmas(text))

    # Add lemmatized text back as a column in the dataframe
    lemmatized_text = pd.Series(lemmatized_text, index=dataframe.index)
    dataframe['lemmas'] = lemmatized_text

    # Save dataframe to file
    dataframe.to_pickle('../data/pandas/' + type_string + '_lemmas' + append_string + '.pkl')
    return True