示例#1
0
 def addToDictionary(self, document: Document):
     """Add the text of every term in every field of ``document`` to ``self.dictionary``.

     Each field is tokenized with a new ``SimpleAnalyzer`` instance; the
     ``text`` attribute of each term produced by the stream is passed to
     ``self.dictionary.add``.

     Args:
         document: Document whose fields are tokenized.
     """
     for field_name in document.getFields():
         stream = document.getField(field_name).tokenStream(SimpleAnalyzer())
         try:
             stream.reset()
             while stream.incrementToken():
                 self.dictionary.add(stream.getTerm().text)
         finally:
             # Release the stream even if tokenizing or the dictionary
             # update raises part-way through a field.
             stream.close()
示例#2
0
    def test_create_from_text(self):
        """Check token and sentence counts of a document created from raw text."""
        text = 'Hello world. Hello France.'
        doc = Document.create_from_text(text)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(len(doc.tokens), 6, 'erreur sur document')
        self.assertEqual(len(doc.sentences), 0, 'erreur')

        # NOTE(review): filename is not used in the visible part of this test;
        # the remainder of the test may lie outside this excerpt.
        filename = os.path.join(DATA_DIR, "test.txt")
示例#3
0
 def addDocument(self, document: Document):
     """Write the per-field term data of ``document`` through ``self.codecs``.

     For every field the token stream is consumed, the values returned by
     ``getPosition()`` are grouped per term, and the groups are emitted via
     the ``startField`` / ``startTerm`` / ``addPosition`` / ``finishTerm`` /
     ``finishField`` calls on ``self.codecs``. Afterwards the document
     counter is incremented and the document's terms are added to
     ``self.dictionary``.

     Args:
         document: Document to write.
     """
     fields = document.getFields()
     self.codecs.startDocument(len(fields.keys()))
     for fieldInfo in fields:
         # NOTE(review): the original author was unsure whether tokenStream()
         # picks the right implementation for the configured analyzer.
         tokenStream = document.getField(fieldInfo).tokenStream(self.config.analyzer)
         termDict = defaultdict(list)
         try:
             tokenStream.reset()
             while tokenStream.incrementToken():
                 term = tokenStream.getTerm()
                 termDict[term].append(tokenStream.getPosition())
         finally:
             # Close the stream like addToDictionary does, also on errors.
             tokenStream.close()

         self.codecs.startField(fieldInfo, len(termDict))
         for term, positions in termDict.items():
             self.codecs.startTerm(term, len(positions))
             # NOTE(review): assumes getPosition() returns a
             # (position, startOffset, endOffset) triple - confirm.
             for position, startOffset, endOffset in positions:
                 self.codecs.addPosition(position, startOffset, endOffset)
             self.codecs.finishTerm()
         self.codecs.finishField()
     self.codecs.finishDocument()
     self.numOfDocs += 1
     self.dictionary.addToDictionary(document)
示例#4
0
文件: Reuters.py 项目: wtl-zju/IR_ZJU
    def addArticle(self, w, article):
        """Build a document from ``article``, add it to ``w`` and record it.

        The first line of ``article`` becomes the title and the remaining
        lines the contents, both decoded as latin1. The abstract is produced
        by ``summarize``. The resulting (title, abstract, contents) triple is
        appended to ``self.documentMap``.

        Args:
            w: Object whose ``addDocument`` method receives the new document.
            article: Object providing ``readline()`` and line iteration that
                yield bytes-like lines.
        """
        strTitle = str(article.readline(), 'latin1')
        # join avoids rebuilding the string once per line
        strArticle = ''.join(str(line, 'latin1') for line in article)
        strAbstract = summarize('', strTitle, strArticle)

        document = Document()
        document.add(StringField('title', strTitle))
        document.add(TextField('abstract', strAbstract))
        document.add(StringField('contents', strArticle))

        w.addDocument(document)
        self.documentMap.append((strTitle, strAbstract, strArticle))
	def __init__(self, data=None, degree=1):
		""" initialize object and, when data is given, compute the coefficients
			@param	data: path to data file; when None nothing is loaded
			@param	degree: degree of polynomial to solve (e.g., quadratic);
					values below 1 are raised to 1
		"""

		self.independent_variables = Matrix()
		self.dependent_variables = Matrix()
		self.coefficients = list()

		# make sure degree is at least 1
		degree = max(degree, 1)

		# if data is specified, load it and set independent and dependent variables accordingly
		if data is not None:
			document = Document().open(filePath=data,splitLines=True,splitTabs=True)

			# loop through the rows in the document to get data
			for row in document:
				values = [float(value) for value in row]

				# powers 0..degree of the first value form the independent row;
				# range (not xrange) keeps this working on Python 2 and 3
				self.independent_variables.append([values[0]**power for power in range(degree+1)])
				# the last value of the row is the dependent variable
				self.dependent_variables.append([values[-1]])

			self.coefficients = self.getCoefficients([self.independent_variables, self.dependent_variables])
示例#6
0
 def read(self, content: str) -> Document:
     """Return the result of ``create_from_text`` called on a new ``Document`` with ``content``."""
     parsed = Document().create_from_text(content)
     return parsed