Exemplo n.º 1
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Register the name of the flag that addStrongAttr appends for each file.
attrNames.append('hasStrong')

def addStrongAttr(info, fileObject, index):
	"""Append a flag to ``info`` saying whether the markup has a <strong> tag.

	Args:
		info: Object with an ``append`` method; exactly one bool is added.
		fileObject: Markup source passed straight to ``BeautifulSoup``.
		index: Not used by this function.

	Returns:
		None. The result is delivered by mutating ``info``.
	"""
	# NOTE(review): no parser is named, so bs4 picks whichever one is
	# installed; confirm whether pinning one (e.g. 'html.parser') is wanted.
	soup = BeautifulSoup(fileObject)

	# Only presence matters, so stop at the first match instead of walking
	# the whole tree to count every <strong> element.
	info.append(soup.find('strong') is not None)

# Add the extractor to the funcs collection imported from __main__.
funcs.append(addStrongAttr)
Exemplo n.º 2
0
from __main__ import funcs, attrNames
import random
import sys
import os
sys.path.append(os.path.dirname(__file__) + '/../data/')
from getRawData import importantFileNum

# Names of the fifty noisy attributes (noisy50 .. noisy99), followed by the
# name of the 'important' flag they are derived from.
attrNames.extend('noisy' + str(bound) for bound in range(50, 100))
attrNames.append('important')

def addNoisyAndImportanceAttr(info, fileObj, index):
	"""Append fifty noisy copies of the importance flag, then the flag itself.

	A file counts as important when ``index`` is below ``importantFileNum``.
	For every bound in 50..99 one value is appended: the true flag with
	probability ``(bound - 1) / 100``, otherwise its negation. The true flag
	is appended last, so ``info`` grows by 51 entries.

	Args:
		info: Object with an ``append`` method that receives the values.
		fileObj: Not used by this function.
		index: Position of the file; compared against ``importantFileNum``.

	Returns:
		None. The result is delivered by mutating ``info``.
	"""
	important = bool(index < importantFileNum)

	for bound in range(50, 100):
		# One draw per attribute; a draw at or above the bound flips the flag.
		if random.randint(1, 100) >= bound:
			info.append(not important)
		else:
			info.append(important)

	info.append(important)

# Add the extractor to the funcs collection imported from __main__.
funcs.append(addNoisyAndImportanceAttr)
Exemplo n.º 3
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Register the name of the flag that addItalicAttr appends for each file.
attrNames.append('hasItalic')

def addItalicAttr(info, fileObject, index):
	"""Append a flag to ``info`` saying whether the markup has an <i> tag.

	Args:
		info: Object with an ``append`` method; exactly one bool is added.
		fileObject: Markup source passed straight to ``BeautifulSoup``.
		index: Not used by this function.

	Returns:
		None. The result is delivered by mutating ``info``.
	"""
	# NOTE(review): no parser is named, so bs4 picks whichever one is
	# installed; confirm whether pinning one (e.g. 'html.parser') is wanted.
	soup = BeautifulSoup(fileObject)

	# Only presence matters, so stop at the first match instead of walking
	# the whole tree to count every <i> element.
	info.append(soup.find('i') is not None)

# Add the extractor to the funcs collection imported from __main__.
funcs.append(addItalicAttr)
Exemplo n.º 4
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Register the name of the flag that addBoldAttr appends for each file.
attrNames.append('hasBold')

def addBoldAttr(info, fileObject, index):
  """Append a flag to ``info`` saying whether the markup has a <b> tag.

  Args:
    info: Object with an ``append`` method; exactly one bool is added.
    fileObject: Markup source passed straight to ``BeautifulSoup``.
    index: Not used by this function.

  Returns:
    None. The result is delivered by mutating ``info``.
  """
  # NOTE(review): no parser is named, so bs4 picks whichever one is
  # installed; confirm whether pinning one (e.g. 'html.parser') is wanted.
  soup = BeautifulSoup(fileObject)

  # Only presence matters, so stop at the first match instead of walking
  # the whole tree to count every <b> element.
  info.append(soup.find('b') is not None)

# Add the extractor to the funcs collection imported from __main__.
funcs.append(addBoldAttr)
Exemplo n.º 5
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Names of the link-count threshold flags: any, >10, >100 and >1000 links.
attrNames.extend([
	'hasLinks',
	'moreThan10Links',
	'moreThan100Links',
	'moreThan1000Links',
])

def addLinkNumAttr(info, fileObject, index):
	"""Append <a>-count threshold flags to ``info``.

	Parses ``fileObject`` with BeautifulSoup, counts the <a> elements and
	appends ``count > 0`` followed by ``count > 10``.

	NOTE(review): the ``> 100`` and ``> 1000`` flags are computed below but
	never appended, although four attribute names are registered for this
	extractor. The function looks cut off here -- confirm against the full
	file before relying on it.

	Args:
		info: Object with an ``append`` method that receives the flags.
		fileObject: Markup source passed straight to ``BeautifulSoup``.
		index: Not used in the visible part of this function.
	"""

	soup = BeautifulSoup(fileObject)

	# Count every <a> element in the document.
	i = 0
	for link in soup.find_all('a'):
		i = i+1

	# `link` is reused from here on as the "has at least one link" flag.
	link = False
	link10 = False
	link100 = False
	link1000 = False
	# All thresholds are strict: exactly 10 links does not set link10.
	if i > 0:
		link = True
	if i > 10:
		link10 = True	
	if i > 100:
		link100 = True
	if i > 1000:
		link1000 = True

	info.append(link)
	info.append(link10)
Exemplo n.º 6
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Register the name of the flag that addEmphasisAttr appends for each file.
attrNames.append('hasEmphasis')

def addEmphasisAttr(info, fileObject, index):
	"""Append a flag to ``info`` saying whether the markup has an <em> tag.

	Args:
		info: Object with an ``append`` method; exactly one bool is added.
		fileObject: Markup source passed straight to ``BeautifulSoup``.
		index: Not used by this function.

	Returns:
		None. The result is delivered by mutating ``info``.
	"""
	# NOTE(review): no parser is named, so bs4 picks whichever one is
	# installed; confirm whether pinning one (e.g. 'html.parser') is wanted.
	soup = BeautifulSoup(fileObject)

	# Only presence matters, so stop at the first match instead of walking
	# the whole tree to count every <em> element.
	info.append(soup.find('em') is not None)

# Add the extractor to the funcs collection imported from __main__.
funcs.append(addEmphasisAttr)
Exemplo n.º 7
0
from bs4 import BeautifulSoup
from __main__ import funcs, attrNames

# Names of the three flags addPhotoNumAttr appends: any, >10 and >100 images.
attrNames.extend([
	'hasPhotos',
	'moreThan10Photos',
	'moreThan100Photos',
])

def addPhotoNumAttr(info, fileObject, index):
	"""Append three <img>-count threshold flags to ``info``.

	The flags are appended in this order: more than 0 images, more than 10
	images, more than 100 images.

	Args:
		info: Object with an ``append`` method; exactly three bools are added.
		fileObject: Markup source passed straight to ``BeautifulSoup``.
		index: Not used by this function.

	Returns:
		None. The result is delivered by mutating ``info``.
	"""
	# NOTE(review): no parser is named, so bs4 picks whichever one is
	# installed; confirm whether pinning one (e.g. 'html.parser') is wanted.
	soup = BeautifulSoup(fileObject)
	imgCount = len(soup.find_all('img'))

	# Thresholds are strict: exactly 10 images does not set the ">10" flag.
	for threshold in (0, 10, 100):
		info.append(imgCount > threshold)
Exemplo n.º 8
0
# Build titleMatchNum distinct random strings; addTitleMatchAttr later uses
# each one as a regular expression against the page title.
patterns = []

for _ in range(titleMatchNum):
  pattern = ''
  isNew = False
  while not isNew:
    # Each pass adds between 1 and 9 characters: after every character a
    # coin flip decides whether to stop early.
    for _ in range(9):
      pattern += random.choice(alphabs)
      if random.randint(0,1) == 0:
        break
    # A duplicate is not discarded; the next pass keeps extending it.
    isNew = pattern not in patterns
  patterns.append(pattern)

# One attribute name per generated pattern, in generation order.
attrNames.extend('titleMatchRegex:' + regex for regex in patterns)

def addTitleMatchAttr(info, fileObject, index):
  soup = BeautifulSoup(fileObject)
  title = ''
  if soup.title:
    title = soup.title.string

  if title:
    for i in range(0, titleMatchNum):
      m = re.search(patterns[i], title)
      if m:
        info.append(True)
      else:
        info.append(False)
  else: