Пример #1
0
def read_data(fp):
    """Send the dollar amounts quoted in relevant articles to the database.

    The input is split into articles with ``splitLexisNexis_AG``.  An
    article is processed only when ``dict_sum`` is positive for both the
    inflow and the agriculture dictionaries and at least one dollar amount
    is found in its lower-cased text.  For every state entry that occurs in
    such an article ``clean_send_to_DB`` is called with the state and the
    list of matched amount strings: entries of ``us_states1.csv`` are
    translated through the module-level ``state_map`` first, entries of
    ``us_states_shortname.csv`` are passed through unchanged.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis_AG``.
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    agriculture_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')

    # Decimal amounts need the digit range [0-9] and a literal dot; the
    # character class [0.9] only matches the characters '0', '.' and '9'.
    money_re = re.compile(
        r'\$ [0-9]+,[0-9]+,[0-9]+|\$ [0-9]+,[0-9]+'
        r'|\$ [0-9]+ million|\$ [0-9]+ billion'
        r'|\$ ?[0-9]+\.[0-9]+ million|\$ ?[0-9]+\.[0-9]+ billion'
        r'|[0-9]+\.[0-9]+ million dollar|[0-9]+\.[0-9]+ billion dollar')

    for article in articles:
        # Relevance and amounts depend only on the article, so they are
        # evaluated once per article.  This also guarantees a state is never
        # paired with amounts found in a different article.
        # NOTE(review): assumes dict_sum has no side effects - confirm.
        if dict_sum(inflow_terms, article) <= 0:
            continue
        if dict_sum(agriculture_terms, article) <= 0:
            continue
        money = money_re.findall(article.lower())
        if not money:
            continue

        # Blank dictionary entries would "occur" in every article.
        for state in full_names:
            if state not in ("", " ") and state in article:
                clean_send_to_DB(state_map[state], money)

        for state in short_names:
            if state not in ("", " ") and state in article:
                clean_send_to_DB(state, money)
def read_data(fp):
    """Send (state, project, amounts) triples found in articles to the database.

    The input is split into articles with ``splitLexisNexis_AG``.  For each
    article that contains at least one dollar amount, every combination of a
    state entry and a project name that both occur in the article is passed
    to ``clean_send_to_DB`` together with the list of matched amount
    strings.  Entries of ``us_states1.csv`` are translated through the
    module-level ``state_map``; entries of ``us_states_shortname.csv`` are
    passed through unchanged.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis_AG``.
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    raw_projects = read_csvDictionary(
        '/home/team3/Data/Dictionaries/Project List.csv')
    # Trailing whitespace is stripped from project names; entries that end
    # up empty are dropped because "" occurs in every article.
    projects = [name.rstrip() for name in raw_projects]
    projects = [name for name in projects if name]

    # One pattern for both state passes: the amounts found in an article do
    # not depend on how the state happens to be spelled.  Decimal amounts use
    # the digit range [0-9] and a literal dot.
    money_re = re.compile(
        r'\$ ?[0-9]+,[0-9]+,[0-9]+|\$ ?[0-9]+,[0-9]+'
        r'|\$ ?[0-9]+ million|\$ ?[0-9]+ billion'
        r'|million dollar|billion dollar|million-dollar|billion-dollar'
        r'|\$ ?[0-9]+\.[0-9]+ million|\$ ?[0-9]+\.[0-9]+ billion'
        r'|[0-9]+\.[0-9]+ million dollar|[0-9]+\.[0-9]+ billion dollar')

    for article in articles:
        # Amounts and mentioned projects depend only on the article text,
        # so they are computed once instead of per state/project pair.
        money = money_re.findall(article.lower())
        if not money:
            continue
        mentioned_projects = [name for name in projects if name in article]
        if not mentioned_projects:
            continue

        for state in full_names:
            if state.strip() and state in article:
                for project_name in mentioned_projects:
                    clean_send_to_DB(state_map[state], project_name, money)

        for state in short_names:
            if state.strip() and state in article:
                for project_name in mentioned_projects:
                    clean_send_to_DB(state, project_name, money)
Пример #3
0
def read_data(fp, start, end, docname):
    """Print the first dollar amount of each relevant article.

    Articles come from ``splitLexisNexis(fp, start, end)``.  An article is
    relevant when it contains at least one entry of ``us_states1.csv`` and
    ``dict_sum`` is positive for both the inflow and the agriculture
    dictionaries.  For a relevant article the first amount with two decimal
    places (for example ``$1,234.56`` or ``12345.00``) is printed; articles
    without such an amount print nothing.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis``.
        start: Passed through to ``splitLexisNexis``.
        end: Passed through to ``splitLexisNexis``.
        docname: Not used by this function; kept so existing callers
            continue to work.
    """
    articles = splitLexisNexis(fp, start, end)

    # The dictionaries do not change between articles, so load them once.
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    agriculture_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')

    amount_re = re.compile(
        r'\-?\$?(?:(?:\d{1,3}(?:,+\d{3}){1,})|\d{4,})\.\d{2}')

    for article in articles:
        # An empty dictionary entry occurs in every text and is ignored.
        if not any(state != "" and state in article for state in states):
            continue
        if dict_sum(inflow_terms, article) <= 0:
            continue
        if dict_sum(agriculture_terms, article) <= 0:
            continue

        found = amount_re.search(article)
        # search() returns None when the article quotes no such amount.
        if found is not None:
            print(found.group(0))
Пример #4
0
def read_data(fp, start, end, docname):
    """Score each article against four dictionaries and store the scores.

    Articles come from ``splitLexisNexis(fp, start, end)``.  For every
    article ``dict_sum`` is evaluated for the state, inflow, outflow and
    agriculture dictionaries, and the four results are passed to
    ``articleDb`` under the identifier ``docname + str(n)``, where ``n``
    starts at ``start`` and increases by one per article.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis``.
        start: Passed to ``splitLexisNexis``; also the number given to the
            first article (assumed to be an int - TODO confirm).
        end: Passed through to ``splitLexisNexis``.
        docname: Prefix of the identifier stored with each article.
    """
    articles = splitLexisNexis(fp, start, end)

    # The dictionaries do not change between articles, so load them once.
    states = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states1.csv')
    inflow_terms = read_csvDictionary(
        '/home/team3/Data/Dictionaries/inflow.csv')
    outflow_terms = read_csvDictionary(
        '/home/team3/Data/Dictionaries/outflow.csv')
    agriculture_terms = read_csvDictionary(
        '/home/team3/Data/Dictionaries/agriculture.csv')

    for article_no, article in enumerate(articles, start):
        # NOTE(review): the 'i' argument presumably selects case-insensitive
        # matching in dict_sum; the state dictionary is scored without it.
        articleDb(docname + str(article_no),
                  dict_sum(states, article),
                  dict_sum(inflow_terms, article, 'i'),
                  dict_sum(outflow_terms, article, 'i'),
                  dict_sum(agriculture_terms, article, 'i'))
Пример #5
0
def read_data(fp):
    """Store the crops mentioned alongside each state in ``ftm.cropType``.

    The input is split into articles with ``splitLexisNexis_AG``.  For every
    article that contains at least one agriculture term, each state named in
    it (entries of ``us_states1.csv``) is mapped to the entry at the same
    position in ``us_states_shortname.csv`` and associated with the list of
    crop terms found in that article.  A later article mentioning the same
    state replaces the list recorded earlier.  Finally one row
    ``(state, crop)`` per recorded crop occurrence is inserted.

    Database errors are reported on stdout and do not propagate.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis_AG``.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    agriculture = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')
    crops = read_csvDictionary('/home/team3/Data/Dictionaries/crops.csv')

    def _overlapping_matcher(terms):
        # Build a lookahead alternation that reports overlapping hits.
        # Terms are escaped so they match literally, and blank terms are
        # dropped because an empty alternative matches at every position.
        literals = [re.escape(term) for term in terms if term]
        if not literals:
            return re.compile(r"(?!)")  # never matches
        return re.compile(r"(?=(" + '|'.join(literals) + r"))")

    state_re = _overlapping_matcher(states)
    agriculture_re = _overlapping_matcher(agriculture)
    crop_re = _overlapping_matcher(crops)

    # Full name -> short name by position; the first occurrence of a
    # duplicated name wins.
    abbreviation = {}
    for name, short in zip(states, states_short):
        abbreviation.setdefault(name, short)

    # NOTE(review): occurrences of the short names themselves are not
    # collected here because nothing was ever written to the database for
    # them - confirm whether they should be inserted as well.
    crops_by_state = {}
    for article in articles:
        if agriculture_re.search(article) is None:
            continue
        crops_found = crop_re.findall(article)
        for name in state_re.findall(article):
            crops_by_state[abbreviation[name]] = crops_found

    rows = [(state, crop)
            for state, found in crops_by_state.items()
            for crop in found]

    # Inserting data into database
    query = "INSERT into ftm.cropType values(%s,%s)"
    conn = None
    try:
        conn = psycopg2.connect("dbname='team3' user='******' host='localhost' password='******'")
        cur = conn.cursor()
        for row in rows:
            cur.execute(query, row)
        conn.commit()
    except psycopg2.Error as exc:
        print("error", exc)
    finally:
        # Release the connection even when an insert fails.
        if conn is not None:
            conn.close()
Пример #6
0
def read_data(fp):
    """Count corruption terms in agriculture articles, keyed by state.

    The input is split into articles with ``splitLexisNexis_AG``.  Only
    articles containing at least one agriculture term and at least one
    corruption term contribute.  For such an article the number of
    corruption-term hits is recorded for every state it names: entries of
    ``us_states1.csv`` are mapped to the entry at the same position in
    ``us_states_shortname.csv``, and entries of the short-name dictionary
    are recorded as they are.  The two tallies are added together.

    Args:
        fp: Source handed unchanged to ``splitLexisNexis_AG``.

    Returns:
        collections.Counter: short state name -> corruption-term count.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    corruption = read_csvDictionary(
        '/home/team3/Data/Dictionaries/corruption1.csv')
    agriculture = read_csvDictionary(
        '/home/team3/Data/Dictionaries/agriculture.csv')

    def _overlapping_matcher(terms):
        # Build a lookahead alternation that reports overlapping hits.
        # Terms are escaped so they match literally, and blank terms are
        # dropped because an empty alternative matches at every position.
        literals = [re.escape(term) for term in terms if term]
        if not literals:
            return re.compile(r"(?!)")  # never matches
        return re.compile(r"(?=(" + '|'.join(literals) + r"))")

    state_re = _overlapping_matcher(states)
    short_re = _overlapping_matcher(states_short)
    agriculture_re = _overlapping_matcher(agriculture)
    corruption_re = _overlapping_matcher(corruption)

    # Full name -> short name by position; the first occurrence of a
    # duplicated name wins.
    abbreviation = {}
    for name, short in zip(states, states_short):
        abbreviation.setdefault(name, short)

    by_full_name = {}
    by_short_name = {}
    for article in articles:
        # The count is taken from this article only, so an unrelated article
        # can never inherit the count of a previous one.
        if agriculture_re.search(article) is None:
            continue
        hits = len(corruption_re.findall(article))
        if hits == 0:
            continue

        # NOTE(review): the count is assigned, not accumulated, so within
        # each tally the last qualifying article mentioning a state wins.
        for name in state_re.findall(article):
            by_full_name[abbreviation[name]] = hits
        for short in short_re.findall(article):
            by_short_name[short] = hits

    # To merge both dictionary values
    return Counter(by_full_name) + Counter(by_short_name)