import logging
import re
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup
from django.db import IntegrityError
from urllib2 import URLError

# MonthlyLink, DailyLink, getHTML and MonthlyCollectionParser are this
# project's own models/helpers; import them from wherever they live in
# the app.

logger = logging.getLogger(__name__)


def parserMonthly():
    unparsed_monthly = MonthlyLink.objects.filter(enable=True)
    #unparsed_monthly = MonthlyLink.objects.all()
    for monthly in unparsed_monthly:
        r = urlparse.urlsplit(monthly.link)
        if monthly.raw_desc:
            HTML = monthly.raw_desc
        else:
            HTML = getHTML(monthly.link)
            monthly.raw_desc = HTML
            #monthly.save()
        if HTML:
            mcp = MonthlyCollectionParser()
            mcp.feed(HTML)
            for link_url, link_title in mcp.links:
                # The parsed hrefs are site-relative, so rebuild absolute
                # URLs against the monthly page's host.
                link_url = urlparse.urlunsplit(
                    ('http', r.netloc, link_url, '', ''))
                daily = DailyLink(monthly_link=monthly, link=link_url,
                                  label=link_title)
                try:
                    daily.save()
                except IntegrityError:
                    # The same movie link already exists; skip it.
                    pass
            monthly.enable = False
            monthly.save()
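parserMonthly() relies on a MonthlyCollectionParser whose feed() fills a links attribute with (href, link text) tuples; that class is not shown here. A minimal sketch of what such a parser could look like, built on Python 2's HTMLParser and assuming exactly that interface:

from HTMLParser import HTMLParser


class MonthlyCollectionParser(HTMLParser):
    """Collects an (href, link text) tuple for every <a> tag it sees."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []            # list of (href, title) tuples
        self._current_href = None
        self._current_text = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._current_href = dict(attrs).get('href')
            self._current_text = []

    def handle_data(self, data):
        if self._current_href is not None:
            self._current_text.append(data)

    def handle_endtag(self, tag):
        if tag == 'a' and self._current_href is not None:
            self.links.append((self._current_href,
                               ''.join(self._current_text).strip()))
            self._current_href = None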
def parseMonth(month_url):
    #logger.info("Parse Month:" + str(month_url.link))
    url = urlparse.urlsplit(month_url.link)
    servername = url[0] + "://" + url[1]
    try:
        page = urllib2.urlopen(month_url.link)
    except URLError:
        raise
    soup = BeautifulSoup(page, fromEncoding='gbk')
    #content = soup.prettify()
    links = soup.findAll('a', {'href': True, 'target': True}, True)
    count = 0
    parsed_count = 0
    # Titles look like "★㊣最新の日本...♂...♀" or "★㊣最新の亚洲...♂...♀".
    # Use alternation here: a character class such as [(日本)(亚洲)] would
    # match any single one of those characters, not the two words.
    reobj = re.compile(u"^★㊣最新の(日本|亚洲).*♂.*♀$")
    for link in links:
        content = link.getText()
        match = reobj.search(content)
        if match:
            count = count + 1
            logger.info(content)
            # Store the daily link.
            linkstr = servername + link.get('href', '')
            dailyLink = DailyLink(link=linkstr, monthly_link=month_url,
                                  label=content)
            try:
                dailyLink.save()
                parsed_count = parsed_count + 1
            except IntegrityError:
                logger.info("URL already existed:...." + linkstr)
            if count > 3:
                # Stop after the fourth matching link each run.
                break
        else:
            logger.info(content + " did not match!")
    return parsed_count
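The alternation fix above matters in practice. A quick check of the corrected pattern against made-up titles of the expected shape (the real link texts come from the scraped page):

>>> import re
>>> reobj = re.compile(u"^★㊣最新の(日本|亚洲).*♂.*♀$")
>>> bool(reobj.search(u"★㊣最新の日本 2012-05-01 ♂new releases♀"))
True
>>> bool(reobj.search(u"★㊣最新の欧美 2012-05-01 ♂new releases♀"))
False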