def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
	if HTML_page_Obj is None and article_file_path is not None:
		self.make_from_file(article_file_path)	## assuming make_from_file is a method of this class

	elif HTML_page_Obj is not None and article_file_path is None:
		self.article_url = HTML_page_Obj.url
		website_base_url = extraction_text_manip.extract_website(self.article_url)
		print "\n Making article found on %s" % website_base_url
		self.article_headline = ""
		self.article_alt_headline_list = []
		self.article_text = ""
		self.article_date = datetime.date(1970, 1, 1)	## Start of UNIX time
		self.article_time = datetime.time(0, 0)

		article_soup = HTML_page_Obj.html_prettify()

		## Fetch every extraction snippet stored for this site, newest first.
		table = sqliteDefaults.verified_select_sqlite(
			conn=conn,
			select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" % website_base_url)

		if table is not None:
			for row in table:
				try_code = row[1]
				try:
					## Each stored snippet must define article_headline,
					## article_alt_headline_list, article_text, article_date
					## and article_time from article_soup.
					exec(try_code)
				except Exception:
					print "Something went wrong while executing that code. Trying next code..."
				else:
					self.article_headline = article_headline.strip()
					self.article_alt_headline_list = article_alt_headline_list
					self.article_text = article_text.strip()
					self.article_date = article_date
					self.article_time = article_time
					return
			print "None of the extraction codes for %s have worked on this article. Please re-check them." % website_base_url
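The `else` branch above reads `article_headline`, `article_alt_headline_list`, `article_text`, `article_date` and `article_time` out of the namespace that `exec` just populated, so every snippet stored in `website_regex` must bind all five names from `article_soup`. A minimal illustrative snippet, assuming `article_soup` is a prettified HTML string and a typical `<h1>`/`<h2>`/`<p>` page layout (the selectors here are hypothetical, not taken from any real site's row):

from bs4 import BeautifulSoup
import datetime

soup = BeautifulSoup(article_soup, "html.parser")
article_headline = soup.find("h1").get_text()	## main headline
article_alt_headline_list = [h.get_text() for h in soup.find_all("h2")]
article_text = "\n".join(p.get_text() for p in soup.find_all("p"))
article_date = datetime.date(1970, 1, 1)	## in practice, parsed from the page
article_time = datetime.time(0, 0)

Executing code pulled from a database row is only safe while that table is written exclusively by trusted hands; anything an attacker can insert there runs with the program's full privileges.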
Example No. 3
		ResultPageNumber	INTEGER,
		URL					TEXT,
		ResultNumber		INTEGER,
		PRIMARY KEY(Topic, URL)
	);
	''' % db_table_name)
conn.commit()

initial_start_date = to_julian_date_datetime(datetime.now().date())
start_date = 0
end_date = 0

results_sorted_by_date_query = "Select StartDate, EndDate from %s WHERE Topic='%s' ORDER BY StartDate ASC" % (db_table_name, topic)

Urls_sorted_by_date = sqliteDefaults.verified_select_sqlite(conn, results_sorted_by_date_query, printing=False)

if len(Urls_sorted_by_date) == 0:
	end_date = initial_start_date
	print "\n\tNo results in database on the topic %s\n" % (topic)
else:
	last_extracted_date = Urls_sorted_by_date[0][0]	## Earliest StartDate in the table, i.e. where the previous run stopped
	end_date = last_extracted_date	## We must resume Google searching from this date

	if resume_from != -1:
		end_date = resume_from	## resume_from should be set to the start date of the latest period in which no URLs were extracted. NOTE: once a time period yields some UNIQUE URLs, stop passing resume_from on subsequent runs.

	num_time_periods_passed = int(round((initial_start_date - end_date) / time_period))	## WHATEVER YOU DO, THE PRODUCT OF time_period*
	num_time_periods_remaining = num_time_periods_remaining - num_time_periods_passed
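`to_julian_date_datetime` is not shown in this fragment. The arithmetic above only needs day-granular numbers, so a plausible implementation (an assumption, not the author's code) converts a `datetime.date` to the Julian Date at midnight:

def to_julian_date_datetime(d):
	## date.toordinal() is 1 for 0001-01-01; the offset 1721424.5
	## shifts that ordinal onto the Julian Date scale (midnight UT).
	return d.toordinal() + 1721424.5

With this, `initial_start_date - end_date` is a span in days, and dividing by `time_period` (also in days) counts how many search windows have already been covered.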
Example No. 5
conn = sqliteDefaults.get_conn("article_extract_db.db")
conn.execute('''CREATE TABLE IF NOT EXISTS `articles_clean` (
	`company_or_sector`		TEXT,
	`article_url`			TEXT,
	PRIMARY KEY(company_or_sector, article_url)
	);
	''')
conn.commit()

company_name = 'Infosys'

## Every article for this company that has not been cleaned yet.
articles = sqliteDefaults.verified_select_sqlite(conn,
	"SELECT DISTINCT article_url, company_name, article_headline, article_text, article_date "
	"FROM articles "
	"WHERE company_name='%s' "
	"AND article_url NOT IN (SELECT article_url FROM articles_clean) "
	"ORDER BY article_url ASC" % (company_name))

conn2 = sqliteDefaults.get_conn("extracted_search_urls.db")

## Menu of topics: keys 1..N for individual companies, round-number
## keys for whole sectors.
company_dict = {}
temp_table = sqliteDefaults.verified_select_sqlite(conn2, "SELECT DISTINCT ArticleTopic FROM articleUrls ORDER BY ArticleTopic ASC")
for i in range(len(temp_table)):
	company_dict[i + 1] = temp_table[i][0]

company_dict[100] = "Financial Services sector"
company_dict[200] = "IT sector"
company_dict[300] = "Energy sector"
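The query above splices `company_name` into the WHERE clause with `%`, which breaks as soon as a name contains a single quote. Whether `sqliteDefaults.verified_select_sqlite` accepts bound parameters isn't visible here; with the standard `sqlite3` connection underneath, a parameterized query avoids the quoting problem entirely (a sketch, assuming `conn` is a plain `sqlite3.Connection`):

rows = conn.execute(
	"SELECT DISTINCT article_url, article_headline, article_text, article_date "
	"FROM articles WHERE company_name = ? "
	"AND article_url NOT IN (SELECT article_url FROM articles_clean) "
	"ORDER BY article_url ASC",
	(company_name,)	## sqlite3 binds the value itself; no manual quoting
).fetchall()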