示例#1
0
	def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None):
		"""Create a TextBlock for every entry of wordsArr (portion shown in this excerpt).

		wordsArr -- iterable whose entries are either an int (that many words
			are taken from self.defaultWords and joined with single spaces)
			or a ready-made text, used as-is. For a ready-made text the word
			count handed to TextBlock is the number of space characters in it.
		numAnchorWordsArr -- optional indexable; entry idx becomes the block's
			anchor-word count. Falls back to 0 when the argument is None or
			has no entry at that index.
		isContentArr -- optional indexable; when entry idx exists it is passed
			to block.setIsContent().
		labelArr -- accepted, but not read in the portion shown here.
		"""
		# Not consumed within this excerpt; kept because the excerpt stops
		# before the end of the loop body.
		textBlocks=[]
		for idx,words in enumerate(wordsArr):
			if type(words)==int:
				numWords=words
				text=' '.join(self.defaultWords[:numWords])
			else:
				text=words
				numWords=text.count(' ')
			try:
				numAnchorWords=numAnchorWordsArr[idx]
			# Parenthesised tuple on purpose: the `except A,B` spelling only
			# catches A (binding it to the name B) on Python 2 and is a
			# syntax error on Python 3, so a too-short array used to raise.
			except (TypeError,IndexError):
				numAnchorWords=0
			block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx)
			try:
				block.setIsContent(isContentArr[idx])
			# None (TypeError) or a missing entry (IndexError) means "leave
			# the flag alone". A TypeError raised inside setIsContent itself
			# is swallowed here as well.
			except (TypeError,IndexError):
				pass
示例#2
0
 def makedoc(self,
             wordsArr,
             numAnchorWordsArr=None,
             isContentArr=None,
             labelArr=None):
     """Create a TextBlock for every entry of wordsArr (portion shown in this excerpt).

     Args:
         wordsArr: iterable whose entries are either an int (that many
             words are taken from self.defaultWords and joined with single
             spaces) or a ready-made text, used as-is. For a ready-made
             text the word count handed to TextBlock is the number of
             space characters in it.
         numAnchorWordsArr: optional indexable; entry idx becomes the
             block's anchor-word count. Falls back to 0 when the argument
             is None or has no entry at that index.
         isContentArr: optional indexable; when entry idx exists it is
             passed to block.setIsContent().
         labelArr: accepted, but not read in the portion shown here.
     """
     # Not consumed within this excerpt; kept because the excerpt stops
     # before the end of the loop body.
     textBlocks = []
     for idx, words in enumerate(wordsArr):
         if type(words) == int:
             numWords = words
             text = ' '.join(self.defaultWords[:numWords])
         else:
             text = words
             numWords = text.count(' ')
         try:
             numAnchorWords = numAnchorWordsArr[idx]
         # Parenthesised tuple on purpose: the `except A, B` spelling only
         # catches A (binding it to the name B) on Python 2 and is a
         # syntax error on Python 3, so a too-short array used to raise.
         except (TypeError, IndexError):
             numAnchorWords = 0
         block = TextBlock(text, set(), numWords, numAnchorWords, 0, 0, idx)
         try:
             block.setIsContent(isContentArr[idx])
         # None (TypeError) or a missing entry (IndexError) means "leave
         # the flag alone". A TypeError raised inside setIsContent itself
         # is swallowed here as well.
         except (TypeError, IndexError):
             pass
示例#3
0
    def makedoc(self,
                wordsArr,
                numAnchorWordsArr=None,
                isContentArr=None,
                labelArr=None):
        """Assemble a TextDocument from per-block specifications.

        Args:
            wordsArr: iterable of block specs. An int spec takes that many
                words from self.defaultWords, joined with single spaces;
                any other spec is used directly as the block text, with the
                word count set to the number of space characters in it.
            numAnchorWordsArr: optional indexable of anchor-word counts;
                a missing array or entry counts as 0.
            isContentArr: optional indexable; an existing entry is passed
                to the block's setIsContent().
            labelArr: optional indexable; an entry may be None (no label),
                a list (each element is added) or a single label.

        Returns:
            A TextDocument wrapping the blocks in wordsArr order.
        """
        blocks = []
        for position, spec in enumerate(wordsArr):
            if type(spec) is int:
                wordCount = spec
                blockText = ' '.join(self.defaultWords[:wordCount])
            else:
                blockText = spec
                wordCount = blockText.count(' ')

            # A None array raises TypeError, a short one IndexError; both
            # mean "no anchor words for this block".
            try:
                anchorCount = numAnchorWordsArr[position]
            except (TypeError, IndexError):
                anchorCount = 0

            current = TextBlock(blockText, set(), wordCount, anchorCount,
                                0, 0, position)

            try:
                current.setIsContent(isContentArr[position])
            except (TypeError, IndexError):
                pass

            try:
                labels = labelArr[position]
                if type(labels) is list:
                    for item in labels:
                        current.addLabel(item)
                elif labels is not None:
                    current.addLabel(labels)
            except (TypeError, IndexError):
                pass

            blocks.append(current)

        return TextDocument(blocks)
示例#4
0
	def test_merge(self):
		"""mergeNext() folds the following block into the one it is called on.

		Checks the merged text (joined with a newline), word and anchor-word
		counts, link and text density, the union of labels and the block
		offset range.
		"""
		head=TextBlock("AA BB CC ",{0},3,3,3,1,0)
		tail=TextBlock("DD EE FF GG HH II JJ .",{1},6,0,6,2,1)
		head.addLabels(DefaultLabels.MIGHT_BE_CONTENT)
		tail.addLabels(DefaultLabels.ARTICLE_METADATA)

		head.mergeNext(tail)

		# Text and counters: 3 + 6 words, anchor words only from the head.
		self.assertEqual(head.getText(),"AA BB CC \nDD EE FF GG HH II JJ .")
		self.assertEqual(head.getNumWords(),9)
		self.assertEqual(head.getNumWordsInAnchorText(),3)
		self.assertAlmostEqual(head.getLinkDensity(),1.0/3.0)
		self.assertEqual(head.getTextDensity(),3)

		# Labels of both blocks end up on the merged block.
		self.assertEqual(
			head.getLabels(),
			{DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA})

		# The merged block spans offsets 0 through 1.
		self.assertEqual(head.getOffsetBlocksStart(),0)
		self.assertEqual(head.getOffsetBlocksEnd(),1)
示例#5
0
 def test_merge(self):
     """mergeNext() folds the following block into the one it is called on.

     Checks the merged text (joined with a newline), word and anchor-word
     counts, link and text density, the union of labels and the block
     offset range.
     """
     head = TextBlock("AA BB CC ", {0}, 3, 3, 3, 1, 0)
     tail = TextBlock("DD EE FF GG HH II JJ .", {1}, 6, 0, 6, 2, 1)
     head.addLabels(DefaultLabels.MIGHT_BE_CONTENT)
     tail.addLabels(DefaultLabels.ARTICLE_METADATA)

     head.mergeNext(tail)

     # Text and counters: 3 + 6 words, anchor words only from the head.
     self.assertEqual(head.getText(), "AA BB CC \nDD EE FF GG HH II JJ .")
     self.assertEqual(head.getNumWords(), 9)
     self.assertEqual(head.getNumWordsInAnchorText(), 3)
     self.assertAlmostEqual(head.getLinkDensity(), 1.0 / 3.0)
     self.assertEqual(head.getTextDensity(), 3)

     # Labels of both blocks end up on the merged block.
     self.assertEqual(
         head.getLabels(),
         {DefaultLabels.MIGHT_BE_CONTENT, DefaultLabels.ARTICLE_METADATA})

     # The merged block spans offsets 0 through 1.
     self.assertEqual(head.getOffsetBlocksStart(), 0)
     self.assertEqual(head.getOffsetBlocksEnd(), 1)