示例#1
0
    def _import_dfr(self, dfr_dir):
        citations = self._import_dfr_metadata(dfr_dir)

        wordcounts_dir = os.path.join(dfr_dir, 'wordcounts')
        for doi in citations.keys():
            try:
                this_text = ''
                for rowdict in \
                    self.parse_csv(os.path.join(wordcounts_dir,
                                   'wordcounts_' + doi.replace('/', '_'
                                   ) + '.CSV')):
                    word = rowdict['WORDCOUNTS']
                    if word in self.stopwords:
                        continue
                    if self.stemming:
                        prestem = word
                        if word not in self.stemmed:
                            self.stemmed[prestem] = stem(self, prestem)
                        word = self.stemmed[prestem]
                    count = int(rowdict['WEIGHT'])

                    this_text += (word + u' ') * count
                if len(this_text) < 20:
                    continue
                yield (doi, this_text)
            except:
                logging.error(doi)
                logging.error(traceback.format_exc())
示例#2
0
    def _output_text(
        self,
        text,
        f,
        filename,
        ):

        text = re.sub(r"[^\w ]+", u' ', text.lower(), flags=re.UNICODE)
        if self.stemming:
            newtext = u''
            for word in text.split():
                if word not in self.stemmed:
                    self.stemmed[word] = stem(self, word)
                if len(self.stemmed[word]) < 4 or word \
                    in self.stopwords:
                    continue

                itemid = self.metadata[filename]['itemID'].split('.')[0]
                self.index[self.stemmed[word]].add(itemid)

                newtext += self.stemmed[word] + u' '
            text = newtext
        else:
            for word in set(text.split()):
                self.index[word].add(itemid)
        f.write(u'\t'.join([filename, self.metadata[filename]['label'],
                text]) + u'\n')
        self.docs.append(filename)
示例#3
0
    def _output_text(
        self,
        text,
        f,
        filename,
    ):

        text = re.sub(r"[^\w ]+", u' ', text.lower(), flags=re.UNICODE)
        if self.stemming:
            newtext = u''
            for word in text.split():
                if word not in self.stemmed:
                    self.stemmed[word] = stem(self, word)
                if len(self.stemmed[word]) < 4 or word \
                    in self.stopwords:
                    continue

                itemid = self.metadata[filename]['itemID'].split('.')[0]
                self.index[self.stemmed[word]].add(itemid)

                newtext += self.stemmed[word] + u' '
            text = newtext
        else:
            itemid = self.metadata[filename]['itemID']
            for word in set(text.split()):
                self.index[word].add(itemid)
        f.write(
            u'\t'.join([filename, self.metadata[filename]['label'], text]) +
            u'\n')
        self.docs.append(filename)
示例#4
0
    def _import_dfr(self, dfr_dir):
        citations = self._import_dfr_metadata(dfr_dir)

        wordcounts_dir = os.path.join(dfr_dir, 'wordcounts')
        for doi in citations.keys():
            try:
                this_text = ''
                for rowdict in \
                    self.parse_csv(os.path.join(wordcounts_dir,
                                   'wordcounts_' + doi.replace('/', '_'
                                   ) + '.CSV')):
                    word = rowdict['WORDCOUNTS']
                    if word in self.stopwords:
                        continue
                    if self.stemming:
                        prestem = word
                        if word not in self.stemmed:
                            self.stemmed[prestem] = stem(self, prestem)
                        word = self.stemmed[prestem]
                    count = int(rowdict['WEIGHT'])

                    this_text += (word + u' ') * count
                if len(this_text) < 20:
                    continue
                yield (doi, this_text)
            except:
                logging.error(doi)
                logging.error(traceback.format_exc())
示例#5
0
 def _ngrams(
     self,
     text,
     n=1,
     stemming=False,
     ):
     text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
     if stemming:
         words = [stem(self, word) for word in text.split()]
     else:
         words = [word for word in text.split()]
     total_n = len(words)
     i = 0
     while i < total_n - (n - 1):
         ngram = words[i:i + n]
         if not any([word in self.stopwords or not word.isalpha()
                    for word in ngram]):
             yield u' '.join(ngram)
         i += 1
示例#6
0
 def _ngrams(
     self,
     text,
     n=1,
     stemming=False,
 ):
     text = re.sub(r"[^\w ]+", u'', text.lower(), flags=re.UNICODE)
     if stemming:
         words = [stem(self, word) for word in text.split()]
     else:
         words = [word for word in text.split()]
     total_n = len(words)
     i = 0
     while i < total_n - (n - 1):
         ngram = words[i:i + n]
         if not any([
                 word in self.stopwords or not word.isalpha()
                 for word in ngram
         ]):
             yield u' '.join(ngram)
         i += 1
示例#7
0
	def _output_text(self, text, f, filename):
		text = re.sub(r"[^\w ]+", u' ', text.lower(), flags=re.UNICODE)
		if self.stemming:
			newtext = u''
			for word in text.split():
				if word not in self.stemmed:
					self.stemmed[word] = stem(self, word)
				if len(self.stemmed[word]) < 4 or word in self.stopwords:
					continue

				# if word not in self.index:
				# 	self.index[word] = set()
				if self.stemmed[word] not in self.index:
					self.index[self.stemmed[word]] = set()
				# self.index[word].add(self.metadata[filename]["itemID"].split('.')[0])
				self.index[self.stemmed[word]].add(self.metadata[filename]["itemID"].split('.')[0])

				newtext += self.stemmed[word] + u' '
			text = newtext
		f.write(u'\t'.join([filename, self.metadata[filename]["label"], text]) + u'\n')
		self.docs.append(filename)