def makedoc(self, wordsArr, numAnchorWordsArr=None, isContentArr=None, labelArr=None):
    """Build a TextDocument containing one TextBlock per entry of ``wordsArr``.

    Args:
        wordsArr: One entry per block. A plain ``int`` n means "use the
            first n items of ``self.defaultWords`` joined by single
            spaces" with a word count of n. Any other value is used as the
            block text itself, and the word count is the number of space
            characters in it.
        numAnchorWordsArr: Optional per-block anchor-word counts. A block
            with no entry (array is None or shorter than ``wordsArr``)
            gets 0.
        isContentArr: Optional per-block values passed to
            ``setIsContent``. Blocks with no entry are left as constructed.
        labelArr: Optional per-block labels. An entry may be None (no
            label), a list (every element is added) or a single label.

    Returns:
        A ``TextDocument`` wrapping the blocks in ``wordsArr`` order.
    """
    textBlocks = []
    for idx, words in enumerate(wordsArr):
        # Exact type test: only a plain int is treated as a word count.
        if type(words) is int:
            numWords = words
            text = ' '.join(self.defaultWords[:numWords])
        else:
            text = words
            numWords = text.count(' ')

        # Indexing None raises TypeError and a too-short array raises
        # IndexError; both mean "no value supplied for this block".
        # The exceptions must be given as a tuple: the comma form
        # ``except TypeError, IndexError`` only catches TypeError (binding
        # it to the name IndexError) and is a syntax error on Python 3.
        try:
            numAnchorWords = numAnchorWordsArr[idx]
        except (TypeError, IndexError):
            numAnchorWords = 0

        block = TextBlock(text, set(), numWords, numAnchorWords, 0, 0, idx)

        try:
            block.setIsContent(isContentArr[idx])
        except (TypeError, IndexError):
            pass

        try:
            label = labelArr[idx]
            if label is None:
                pass
            elif type(label) is list:
                for singleLabel in label:
                    block.addLabel(singleLabel)
            else:
                block.addLabel(label)
        except (TypeError, IndexError):
            pass

        textBlocks.append(block)
    return TextDocument(textBlocks)
def makedoc(self, wordsArr, numAnchorWordsArr=None, isContentArr=None, labelArr=None):
    """Create a TextDocument with one TextBlock for each item of ``wordsArr``.

    Args:
        wordsArr: Block specifications. A plain ``int`` n selects the first
            n items of ``self.defaultWords`` (joined with single spaces)
            as the text and n as the word count. Anything else is taken as
            the text directly, with the word count set to the number of
            space characters it contains.
        numAnchorWordsArr: Optional anchor-word count per block; defaults
            to 0 for any block without an entry.
        isContentArr: Optional value per block forwarded to
            ``setIsContent``; skipped for any block without an entry.
        labelArr: Optional label per block: None adds nothing, a list adds
            each of its elements, any other value is added as one label.

    Returns:
        A ``TextDocument`` built from the blocks, ordered as ``wordsArr``.
    """
    textBlocks = []
    for idx, words in enumerate(wordsArr):
        # Exact type test: only a plain int is treated as a word count.
        if type(words) is int:
            numWords = words
            text = ' '.join(self.defaultWords[:numWords])
        else:
            text = words
            numWords = text.count(' ')

        # A missing optional array (None -> TypeError) and a short one
        # (IndexError) are both treated as "nothing given for this block".
        # Both exception types are listed in a tuple; the Python 2 comma
        # form would catch TypeError only and does not parse on Python 3.
        try:
            numAnchorWords = numAnchorWordsArr[idx]
        except (TypeError, IndexError):
            numAnchorWords = 0

        block = TextBlock(text, set(), numWords, numAnchorWords, 0, 0, idx)

        try:
            block.setIsContent(isContentArr[idx])
        except (TypeError, IndexError):
            pass

        try:
            label = labelArr[idx]
            if label is None:
                pass
            elif type(label) is list:
                for singleLabel in label:
                    block.addLabel(singleLabel)
            else:
                block.addLabel(label)
        except (TypeError, IndexError):
            pass

        textBlocks.append(block)
    return TextDocument(textBlocks)
def makedoc(self, wordsArr, numAnchorWordsArr=None, isContentArr=None, labelArr=None):
    """Assemble a TextDocument from per-block specifications.

    Each item of ``wordsArr`` yields one TextBlock. An ``int`` item n uses
    the first n entries of ``self.defaultWords`` joined by spaces (word
    count n); any other item is used as the text itself, with the word
    count taken as the number of spaces in it.

    The three optional arrays are looked up by block position. When an
    array is None or has no entry at that position, the anchor-word count
    falls back to 0 and the content flag / labels are simply not applied.
    A label entry may be None, a list of labels, or a single label.

    Returns the TextDocument wrapping all blocks in input order.
    """
    collected = []
    for position, spec in enumerate(wordsArr):
        if type(spec) != int:
            text = spec
            wordCount = text.count(' ')
        else:
            wordCount = spec
            text = ' '.join(self.defaultWords[:wordCount])

        # None is not subscriptable (TypeError); a short array gives
        # IndexError. Either way there is no anchor count for this block.
        try:
            anchorCount = numAnchorWordsArr[position]
        except (TypeError, IndexError):
            anchorCount = 0

        current = TextBlock(text, set(), wordCount, anchorCount, 0, 0, position)

        try:
            current.setIsContent(isContentArr[position])
        except (TypeError, IndexError):
            pass

        try:
            entry = labelArr[position]
            if entry is not None:
                # Normalise to something iterable: a list is walked as-is,
                # anything else counts as one label.
                pending = entry if type(entry) == list else [entry]
                for item in pending:
                    current.addLabel(item)
        except (TypeError, IndexError):
            pass

        collected.append(current)
    return TextDocument(collected)
def test_merge(self):
    """Check the state of a TextBlock after ``mergeNext`` absorbs its successor."""
    head = TextBlock("AA BB CC ", set([0]), 3, 3, 3, 1, 0)
    tail = TextBlock("DD EE FF GG HH II JJ .", set([1]), 6, 0, 6, 2, 1)
    head.addLabels(DefaultLabels.MIGHT_BE_CONTENT)
    tail.addLabels(DefaultLabels.ARTICLE_METADATA)

    head.mergeNext(tail)

    # Text of both blocks, separated by a newline.
    expectedText = "AA BB CC \nDD EE FF GG HH II JJ ."
    self.assertEqual(head.getText(), expectedText)

    # Word statistics of the merged block.
    self.assertEqual(head.getNumWords(), 9)
    self.assertEqual(head.getNumWordsInAnchorText(), 3)
    self.assertAlmostEqual(head.getLinkDensity(), 1.0 / 3.0)
    self.assertEqual(head.getTextDensity(), 3)

    # Labels from both blocks are present on the merged block.
    expectedLabels = set([
        DefaultLabels.MIGHT_BE_CONTENT,
        DefaultLabels.ARTICLE_METADATA,
    ])
    self.assertEqual(head.getLabels(), expectedLabels)

    # The merged block spans the offsets of both source blocks.
    self.assertEqual(head.getOffsetBlocksStart(), 0)
    self.assertEqual(head.getOffsetBlocksEnd(), 1)
def test_merge(self):
    """Merge two labelled blocks and verify text, counts, labels and offsets."""
    # Arrange: two adjacent blocks, each carrying a different label.
    leading = TextBlock("AA BB CC ", set([0]), 3, 3, 3, 1, 0)
    following = TextBlock("DD EE FF GG HH II JJ .", set([1]), 6, 0, 6, 2, 1)
    leading.addLabels(DefaultLabels.MIGHT_BE_CONTENT)
    following.addLabels(DefaultLabels.ARTICLE_METADATA)

    # Act: fold the second block into the first.
    leading.mergeNext(following)
    merged = leading

    # Assert: the first block now describes the combined content.
    self.assertEqual(merged.getText(), "AA BB CC \nDD EE FF GG HH II JJ .")
    self.assertEqual(merged.getNumWords(), 9)
    self.assertEqual(merged.getNumWordsInAnchorText(), 3)
    oneThird = 1.0 / 3.0
    self.assertAlmostEqual(merged.getLinkDensity(), oneThird)
    self.assertEqual(merged.getTextDensity(), 3)
    bothLabels = set([DefaultLabels.MIGHT_BE_CONTENT, DefaultLabels.ARTICLE_METADATA])
    self.assertEqual(merged.getLabels(), bothLabels)
    self.assertEqual(merged.getOffsetBlocksStart(), 0)
    self.assertEqual(merged.getOffsetBlocksEnd(), 1)