# Module-level imports these methods rely on; `stem` is assumed to be a
# helper defined elsewhere in this module.
import logging
import os
import re
import traceback


def _import_dfr(self, dfr_dir):
    """Yield (DOI, reconstructed text) pairs from a JSTOR Data for
    Research (DfR) wordcounts download."""
    citations = self._import_dfr_metadata(dfr_dir)
    wordcounts_dir = os.path.join(dfr_dir, 'wordcounts')
    for doi in citations:
        try:
            this_text = u''
            csv_path = os.path.join(
                wordcounts_dir,
                'wordcounts_' + doi.replace('/', '_') + '.CSV')
            for rowdict in self.parse_csv(csv_path):
                word = rowdict['WORDCOUNTS']
                if word in self.stopwords:
                    continue
                if self.stemming:
                    # memoize stems so each surface form is stemmed once
                    prestem = word
                    if prestem not in self.stemmed:
                        self.stemmed[prestem] = stem(self, prestem)
                    word = self.stemmed[prestem]
                count = int(rowdict['WEIGHT'])
                # rebuild a bag-of-words text by repeating each word
                # `count` times
                this_text += (word + u' ') * count
            # skip near-empty documents
            if len(this_text) < 20:
                continue
            yield (doi, this_text)
        except Exception:
            # catch Exception rather than a bare except, so e.g.
            # KeyboardInterrupt still propagates
            logging.error(doi)
            logging.error(traceback.format_exc())
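# A minimal usage sketch for the generator above; `_demo_import_dfr` and
# the `processor` argument are hypothetical stand-ins, not names from
# this module. The processor is assumed to expose the method defined
# above along with its stopwords/stemming state.
def _demo_import_dfr(processor, dfr_dir):
    for doi, text in processor._import_dfr(dfr_dir):
        logging.info('%s reconstructed with %d tokens',
                     doi, len(text.split()))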
def _output_text(self, text, f, filename):
    """Append one tab-separated record (filename, label, text) to `f`,
    indexing each kept word under the document's itemID."""
    # replace punctuation with spaces, keeping word characters
    text = re.sub(r"[^\w ]+", u' ', text.lower(), flags=re.UNICODE)
    if self.stemming:
        newtext = u''
        for word in text.split():
            if word not in self.stemmed:
                self.stemmed[word] = stem(self, word)
            # drop very short stems and stopwords
            if len(self.stemmed[word]) < 4 or word in self.stopwords:
                continue
            itemid = self.metadata[filename]['itemID'].split('.')[0]
            # self.index maps words to sets of itemIDs; it must be a
            # defaultdict(set), or have its sets initialized beforehand
            self.index[self.stemmed[word]].add(itemid)
            newtext += self.stemmed[word] + u' '
        text = newtext
    else:
        itemid = self.metadata[filename]['itemID']
        for word in set(text.split()):
            self.index[word].add(itemid)
    f.write(u'\t'.join([filename, self.metadata[filename]['label'], text])
            + u'\n')
    self.docs.append(filename)
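# Hedged sketch of the state _output_text expects; the attribute shapes
# below (defaultdict(set) index, metadata dict, docs list) are inferred
# from the method body above, not copied from the module's constructor.
def _demo_output_text(processor):
    import codecs
    from collections import defaultdict
    processor.stemming = False
    processor.index = defaultdict(set)
    processor.docs = []
    processor.metadata = {'doc1.txt': {'itemID': '123', 'label': 'demo'}}
    with codecs.open('corpus.tsv', 'w', encoding='utf-8') as f:
        processor._output_text(u'Some sample text!', f, 'doc1.txt')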
def _ngrams(self, text, n=1, stemming=False):
    """Yield space-joined n-grams, skipping any window that contains a
    stopword or a non-alphabetic token."""
    # strip punctuation outright (no replacement space)
    text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
    if stemming:
        words = [stem(self, word) for word in text.split()]
    else:
        words = text.split()
    total_n = len(words)
    for i in range(total_n - (n - 1)):
        ngram = words[i:i + n]
        if not any(word in self.stopwords or not word.isalpha()
                   for word in ngram):
            yield u' '.join(ngram)
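# Illustrative call; the stub class is a stand-in for the real object so
# the generator can be exercised unbound, and is not part of the module.
def _demo_ngrams():
    class _Stub(object):
        stopwords = set([u'the', u'of'])
    # bigrams of "The quick brown fox!" -> [u'quick brown', u'brown fox']
    return list(_ngrams(_Stub(), u'The quick brown fox!', n=2))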