Exemplo n.º 1
0
    def article_supersplit(self, article=None):
        '''
        Split an article into paragraphs, sentences and words.

        The article is first passed through
        extraction_text_manip.properly_format(), then split on newlines
        into paragraphs and on ". " into sentences. Each sentence is
        tokenized with nltk.word_tokenize(); if that raises, a plain
        split on spaces is used as a backup.

        article: the text to split. When None, self.article_text is used.

        Returns 'text', structured as:
            a list of paragraphs,
                where each paragraph is a list of sentences,
                    where each sentence is a list of words
                    (punctuation as separate words when NLTK tokenizes).
        '''
        if article is None:
            article = self.article_text

        article = extraction_text_manip.properly_format(article)
        # Same helper for every level; the qualified name matches how the
        # module is referenced everywhere else in this method.
        remove_empty = extraction_text_manip.remove_empty_from_list

        text = remove_empty(article.split("\n"))  #get paragraphs
        for i in range(len(text)):
            sentences = remove_empty(text[i].split(". "))  #get sentences
            text[i] = sentences
            last_index = len(sentences) - 1
            for j in range(len(sentences)):
                try:
                    words = nltk.word_tokenize(sentences[j])
                except Exception:
                    # Deliberately broad: any NLTK failure falls back to a
                    # plain whitespace split.
                    words = sentences[j].split(" ")  #get words
                    # Splitting on ". " removed the sentence terminator,
                    # so put it back as its own token.
                    words.append(".")
                    # Drop commas, semicolons and all bracket characters.
                    words = [re.sub(r"[,;()\[\]{}]", "", word)
                             for word in words]
                    # Only the last sentence of a paragraph still carries
                    # its own period on its final word; strip it so the
                    # appended "." token is not a duplicate.
                    if j == last_index and words[-2].endswith("."):
                        words[-2] = words[-2][:-1]
                sentences[j] = remove_empty(words)

        return text
	def article_supersplit(self, article=None):
		'''
		Split an article into paragraphs, sentences and words.

		The article is first passed through
		extraction_text_manip.properly_format(), then split on newlines
		into paragraphs and on ". " into sentences. Each sentence is
		tokenized with nltk.word_tokenize(); if that raises, a plain
		split on spaces is used as a backup.

		article: the text to split. When None, self.article_text is used.

		Returns 'text', structured as:
			a list of paragraphs,
				where each paragraph is a list of sentences,
					where each sentence is a list of words
					(punctuation as separate words when NLTK tokenizes).
		'''
		if article is None:
			article = self.article_text

		article = extraction_text_manip.properly_format(article)
		# Same helper for every level; the qualified name matches how the
		# module is referenced everywhere else in this method.
		remove_empty = extraction_text_manip.remove_empty_from_list

		text = remove_empty(article.split("\n")) #get paragraphs
		for i in range(len(text)):
			sentences = remove_empty(text[i].split(". ")) #get sentences
			text[i] = sentences
			last_index = len(sentences) - 1
			for j in range(len(sentences)):
				try:
					words = nltk.word_tokenize(sentences[j])
				except Exception:
					# Deliberately broad: any NLTK failure falls back to a
					# plain whitespace split.
					words = sentences[j].split(" ") #get words
					# Splitting on ". " removed the sentence terminator,
					# so put it back as its own token.
					words.append(".")
					# Drop commas, semicolons and all bracket characters.
					words = [re.sub(r"[,;()\[\]{}]", "", word) for word in words]
					# Only the last sentence of a paragraph still carries
					# its own period on its final word; strip it so the
					# appended "." token is not a duplicate.
					if j == last_index and words[-2].endswith("."):
						words[-2] = words[-2][:-1]
				sentences[j] = remove_empty(words)

		return text
Exemplo n.º 3
0
		elif select.lower() == 'p' or select.lower() == 'b':
			i-=2		## Go to previous entry
			delete_query = "DELETE from articles_clean where article_url='%s'"%(articles[i+1][0])
			print delete_query
			conn.execute(delete_query)	
			conn.commit()
						## Delete listing for previous entry
			break;




		else:
			## Step 1: convert input string to a list of numbers

			nums_str= extraction_text_manip.remove_empty_from_list(select.split(' '))
			try: 
				input_companies_nums=list(set([int(x) for x in nums_str]))
			except Exception:
				print "\n\tERROR #2: please enter an integer."
				continue

			# print input_companies_nums

			if input_companies_nums == [0]:
				try:
					sqliteDefaults.insert_table_sqlite(conn, 
					'articles_clean',
					('company_or_sector', 'article_url'),
					[ ('None', articles[i][0]) ]
					)