def addToDictionary(self, document: Document):
    """Add the text of every term in every field of ``document`` to ``self.dictionary``.

    Each field is tokenized with a ``SimpleAnalyzer``; the token stream is
    reset, drained, and then closed.

    Args:
        document: object exposing ``getFields()`` (iterated for field keys)
            and ``getField(key)``, whose result provides ``tokenStream(analyzer)``.
    """
    # One analyzer serves all fields; streams are consumed one after another.
    analyzer = SimpleAnalyzer()
    for key in document.getFields():
        tokenStream = document.getField(key).tokenStream(analyzer)
        try:
            tokenStream.reset()
            while tokenStream.incrementToken():
                self.dictionary.add(tokenStream.getTerm().text)
        finally:
            # Close even when tokenization fails so the stream is not leaked.
            tokenStream.close()
def test_create_from_text(self):
    """Check the token and sentence counts of a Document built from raw text."""
    text = 'Hello world. Hello France.'
    doc = Document.create_from_text(text)

    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(len(doc.tokens), 6, 'erreur sur document')
    self.assertEqual(len(doc.sentences), 0, 'erreur')

    # NOTE(review): filename is not used anywhere in this method as shown;
    # confirm whether a file-based check was meant to follow.
    filename = os.path.join(DATA_DIR, "test.txt")
def addDocument(self, document: Document):
    """Write the term positions of every field of ``document`` through ``self.codecs``.

    For each field the token stream is drained and the position records are
    grouped per term. The groups are then emitted with the
    startField / startTerm / addPosition / finishTerm / finishField calls,
    bracketed by startDocument / finishDocument. Afterwards ``self.numOfDocs``
    is incremented and the document is passed to
    ``self.dictionary.addToDictionary``.

    Args:
        document: object exposing ``getFields()`` (must support ``keys()``
            and iteration) and ``getField(key)``.
    """
    fields = document.getFields()
    self.codecs.startDocument(len(fields.keys()))

    for fieldInfo in fields:
        # NOTE(review): the original author was unsure whether tokenStream()
        # picks the right implementation for each field type - confirm.
        tokenStream = document.getField(fieldInfo).tokenStream(self.config.analyzer)

        # term -> list of position records, in the order they were produced.
        termDict = defaultdict(list)
        try:
            tokenStream.reset()
            while tokenStream.incrementToken():
                term = tokenStream.getTerm()
                termDict[term].append(tokenStream.getPosition())
        finally:
            # Release the stream even if tokenization fails part-way.
            tokenStream.close()

        self.codecs.startField(fieldInfo, len(termDict))
        for term, positions in termDict.items():
            self.codecs.startTerm(term, len(positions))
            # Each record unpacks into (position, startOffset, endOffset).
            for position, startOffset, endOffset in positions:
                self.codecs.addPosition(position, startOffset, endOffset)
            self.codecs.finishTerm()
        self.codecs.finishField()

    self.codecs.finishDocument()
    self.numOfDocs += 1
    self.dictionary.addToDictionary(document)
def addArticle(self, w, article):
    """Index one article through ``w`` and record it in ``self.documentMap``.

    The first line read from ``article`` becomes the title and all remaining
    lines are concatenated into the body; both are decoded as Latin-1. An
    abstract is obtained from ``summarize`` and the three parts are stored
    as the ``title``, ``abstract`` and ``contents`` fields of a new document.

    Args:
        w: object whose ``addDocument(document)`` receives the new document.
        article: iterable of bytes-like lines that also supports ``readline()``.
    """
    strTitle = str(article.readline(), 'latin1')
    # join builds the body in one pass instead of repeated += concatenation.
    strArticle = ''.join(str(line, 'latin1') for line in article)
    strAbstract = summarize('', strTitle, strArticle)

    document = Document()
    document.add(StringField('title', strTitle))
    document.add(TextField('abstract', strAbstract))
    document.add(StringField('contents', strArticle))
    w.addDocument(document)

    self.documentMap.append((strTitle, strAbstract, strArticle))
def __init__(self, data=None, degree=1):
    """
    initialize object; when a data file is given, load it and solve for the
    polynomial coefficients
    @param data: path to data file; each row is converted to floats, the
                 first value is the independent variable and the last value
                 is the dependent variable
    @param degree: degree of polynomial to solve (e.g., quadratic); values
                   below 1 are raised to 1
    """
    self.independent_variables = Matrix()
    self.dependent_variables = Matrix()
    self.coefficients = list()

    # make sure degree is at least 1
    if degree < 1:
        degree = 1

    # if data is specified, load it and set independent and dependent variables accordingly
    if data is not None:
        document = Document().open(filePath=data, splitLines=True, splitTabs=True)

        # loop through the rows in the document to get data
        for row in document:
            values = [float(value) for value in row]
            x, y = values[0], values[-1]
            # one row of powers x**0 .. x**degree
            # (range instead of xrange so this also runs on Python 3)
            self.independent_variables.append([x ** i for i in range(degree + 1)])
            self.dependent_variables.append([y])

        self.coefficients = self.getCoefficients(
            [self.independent_variables, self.dependent_variables]
        )
def read(self, content: str) -> Document:
    """Return the result of ``create_from_text(content)`` called on a new ``Document``."""
    # Instantiate first, then delegate the parsing to create_from_text.
    blank = Document()
    parsed = blank.create_from_text(content)
    return parsed