def article_supersplit(self, article=None):
    '''
    Split an article into paragraphs, sentences and words.

    The article is first passed through
    extraction_text_manip.properly_format(). The returned 'text' is
    structured as:
    a list of paragraphs,
        where each paragraph is a list of sentences,
            where each sentence is a list of words, punctuations as
            separate words.

    Paragraphs are delimited by "\\n" and sentences by ". ". Words come
    from nltk.word_tokenize(); if that call raises, a plain split on
    " " is used instead. Every level is filtered through
    extraction_text_manip.remove_empty_from_list().

    Args:
        article: the article text. When None, self.article_text is used.

    Returns:
        The nested paragraph / sentence / word list described above.
    '''
    if article is None:
        article = self.article_text
    article = extraction_text_manip.properly_format(article)

    remove_empty = extraction_text_manip.remove_empty_from_list
    # Characters the fallback tokenizer deletes from every word.
    stripped_chars = r"[,;()\[\]{}]"

    text = remove_empty(article.split("\n"))  # get paragraphs
    for i, paragraph in enumerate(text):
        sentences = remove_empty(paragraph.split(". "))  # get sentences
        text[i] = sentences
        last = len(sentences) - 1
        for j, sentence in enumerate(sentences):
            try:
                words = nltk.word_tokenize(sentence)
            except Exception:
                # Deliberate best-effort: whatever made NLTK fail, fall
                # back to a whitespace split rather than abort the article.
                words = sentence.split(" ")  # get words
                # split(". ") consumed the full stop; restore it as a token.
                words.append(".")
                words = [re.sub(stripped_chars, "", word) for word in words]
                # Only the paragraph's final sentence can still carry its
                # own full stop on its last word (nothing followed it for
                # split(". ") to consume); drop it so it is not doubled.
                # Checked on the current sentence only: earlier sentences
                # must not reach into the not-yet-tokenized last one.
                if j == last and words[-2].endswith("."):
                    words[-2] = words[-2][:-1]
            sentences[j] = remove_empty(words)
    return text
def article_supersplit(self, article=None):
    '''
    Split an article into paragraphs, sentences and words.

    The article is first passed through
    extraction_text_manip.properly_format(). The returned 'text' is
    structured as:
    a list of paragraphs,
        where each paragraph is a list of sentences,
            where each sentence is a list of words, punctuations as
            separate words.

    Paragraphs are delimited by "\\n" and sentences by ". ". Words come
    from nltk.word_tokenize(); if that call raises, a plain split on
    " " is used instead. Every level is filtered through
    extraction_text_manip.remove_empty_from_list().

    Args:
        article: the article text. When None, self.article_text is used.

    Returns:
        The nested paragraph / sentence / word list described above.
    '''
    if article is None:
        article = self.article_text
    article = extraction_text_manip.properly_format(article)

    remove_empty = extraction_text_manip.remove_empty_from_list
    # Characters the fallback tokenizer deletes from every word.
    stripped_chars = r"[,;()\[\]{}]"

    text = remove_empty(article.split("\n"))  # get paragraphs
    for i, paragraph in enumerate(text):
        sentences = remove_empty(paragraph.split(". "))  # get sentences
        text[i] = sentences
        last = len(sentences) - 1
        for j, sentence in enumerate(sentences):
            try:
                words = nltk.word_tokenize(sentence)
            except Exception:
                # Deliberate best-effort: whatever made NLTK fail, fall
                # back to a whitespace split rather than abort the article.
                words = sentence.split(" ")  # get words
                # split(". ") consumed the full stop; restore it as a token.
                words.append(".")
                words = [re.sub(stripped_chars, "", word) for word in words]
                # Only the paragraph's final sentence can still carry its
                # own full stop on its last word (nothing followed it for
                # split(". ") to consume); drop it so it is not doubled.
                # Checked on the current sentence only: earlier sentences
                # must not reach into the not-yet-tokenized last one.
                if j == last and words[-2].endswith("."):
                    words[-2] = words[-2][:-1]
            sentences[j] = remove_empty(words)
    return text
elif select.lower() == 'p' or select.lower() == 'b': i-=2 ## Go to previous entry delete_query = "DELETE from articles_clean where article_url='%s'"%(articles[i+1][0]) print delete_query conn.execute(delete_query) conn.commit() ## Delete listing for previous entry break; else: ## Step 1: convert input string to a list of numbers nums_str= extraction_text_manip.remove_empty_from_list(select.split(' ')) try: input_companies_nums=list(set([int(x) for x in nums_str])) except Exception: print "\n\tERROR #2: please enter an integer." continue # print input_companies_nums if input_companies_nums == [0]: try: sqliteDefaults.insert_table_sqlite(conn, 'articles_clean', ('company_or_sector', 'article_url'), [ ('None', articles[i][0]) ] )