def read_data(fp):
    """Scan LexisNexis articles for US-state mentions plus inflow and
    agriculture terms, extract dollar amounts, and send matches to the DB.

    fp -- passed straight through to splitLexisNexis_AG; one record per article.
    Side effects: calls clean_send_to_DB for each qualifying article, once per
    state-name form (full name via state_map, short name as-is).
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    agri_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')

    # Dollar-amount pattern, compiled once.  NOTE(review): the original used
    # the character class [0.9] (which matches only '0', '.', '9') where
    # [0-9] was clearly intended, and left the decimal point unescaped so it
    # matched any character; both fixed.  It also listed the same
    # "million dollar" alternative twice where a "billion dollar" variant
    # was evidently intended.
    money_re = re.compile(
        r'\$ ?[0-9]+,[0-9]+,[0-9]+'
        r'|\$ ?[0-9]+,[0-9]+'
        r'|\$ ?[0-9]+ million'
        r'|\$ ?[0-9]+ billion'
        r'|\$ ?[0-9]+\.[0-9]+ million'
        r'|\$ ?[0-9]+\.[0-9]+ billion'
        r'|[0-9]+\.[0-9]+ million dollar'
        r'|[0-9]+\.[0-9]+ billion dollar')

    def _last_match(names, text):
        # Original behaviour preserved: when several dictionary entries occur
        # in the article, the LAST one in dictionary order wins.
        found = ""
        for name in names:
            if name in text:
                found = name
        return found

    for article in articles:
        # Only articles mentioning both inflow and agriculture terms qualify.
        if dict_sum(inflow_terms, article) <= 0 or dict_sum(agri_terms, article) <= 0:
            continue
        money = money_re.findall(article.lower())
        if not money:
            continue
        state = _last_match(full_names, article)
        if state:
            # Full state names are translated through state_map before storage.
            clean_send_to_DB(state_map[state], money)
        state = _last_match(short_names, article)
        if state:
            clean_send_to_DB(state, money)
def read_data(fp):
    """Scan LexisNexis articles for US-state and project-name mentions,
    extract dollar amounts, and send (state, project, money) to the DB.

    fp -- passed straight through to splitLexisNexis_AG.
    Side effects: calls clean_send_to_DB once per matching state per article
    (full names mapped through state_map, short names sent as-is).
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    raw_projects = read_csvDictionary(
        '/home/team3/Data/Dictionaries/Project List.csv')
    # Project names come with trailing whitespace/newlines from the CSV.
    projects = [project.rstrip() for project in raw_projects]

    # Money pattern, compiled once and shared by both scans.  NOTE(review):
    # the original's first pattern had a missing '|' that fused the
    # "million-dollar" and "$ x.y million" alternatives into one unmatchable
    # branch, and both patterns used [0.9] where [0-9] was intended with an
    # unescaped decimal point; all fixed.  The richer second pattern
    # (billion variants, hyphenated forms) is used for both loops for
    # consistency.
    money_re = re.compile(
        r'\$ ?[0-9]+,[0-9]+,[0-9]+'
        r'|\$ ?[0-9]+,[0-9]+'
        r'|\$ ?[0-9]+ million'
        r'|\$ ?[0-9]+ billion'
        r'|million dollar|billion dollar'
        r'|million-dollar|billion-dollar'
        r'|\$ ?[0-9]+\.[0-9]+ million'
        r'|\$ ?[0-9]+\.[0-9]+ billion'
        r'|[0-9]+\.[0-9]+ million dollar'
        r'|[0-9]+\.[0-9]+ billion dollar')

    def _last_project(text):
        # Last project name found in dictionary order wins (original behaviour).
        found = ""
        for project in projects:
            if project in text:
                found = project
        return found

    for article in articles:
        lowered = article.lower()
        # First pass: full state names, keyed through state_map.
        for state in full_names:
            if state in article:
                project_name = _last_project(article)
                money = money_re.findall(lowered)
                if money:
                    clean_send_to_DB(state_map[state], project_name, money)
        # Second pass: short state names, stored verbatim.
        for state in short_names:
            if state in article:
                project_name = _last_project(article)
                money = money_re.findall(lowered)
                if money:
                    clean_send_to_DB(state, project_name, money)
    # NOTE(review): a large block of commented-out experimentation code was
    # removed here; it ended with an unterminated triple-quoted string that
    # made the file a syntax error.
def read_data(fp, start, end, docname):
    """Print the first decimal dollar amount found in each article that
    mentions a US state plus inflow and agriculture terms.

    fp, start, end -- passed to splitLexisNexis to select the article range.
    docname -- retained for interface compatibility; unused in this version
    (it fed a commented-out articleDb call in the original).
    """
    articles = splitLexisNexis(fp, start, end)
    # Dictionaries are loop-invariant; the original re-read all four CSV
    # files for every article.  Load them once.  (us_states_shortname.csv
    # was loaded but never used in this version and has been dropped.)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    agri_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')
    # Matches amounts like "$1,234,567.89" or "-12345.00".
    money_re = re.compile(r'-?\$?(?:(?:\d{1,3}(?:,+\d{3}){1,})|\d{4,})\.\d{2}')
    for article in articles:
        state_full = ""
        for state in states:
            if state in article:
                state_full = state  # last dictionary match wins
        if state_full and dict_sum(inflow_terms, article) > 0 \
                and dict_sum(agri_terms, article) > 0:
            m = money_re.search(article)
            # Guard added: the original called m.group(0) unconditionally and
            # raised AttributeError whenever no amount was present.
            if m:
                print(m.group(0))
def read_data(fp, start, end, docname):
    """Score each article in the [start, end) range against four term
    dictionaries and persist the per-article counts via articleDb.

    fp, start, end -- passed to splitLexisNexis to select the article range.
    docname -- prefix for the per-article record key (docname + index).
    """
    articles = splitLexisNexis(fp, start, end)
    # Loop-invariant hoist: the original re-read all four CSV dictionaries
    # from disk for every single article.
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    outflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/outflow.csv')
    agri_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')
    for i, article in enumerate(articles, start=start):
        # Tokenisation pipeline retained from the original even though its
        # result is unused by the scoring below — presumably side-effect
        # free, but TODO confirm before deleting.
        words = remove_stopWords(tokenize_stem_file(article))
        get_frequency(words)
        # The 'i' mode flag is what the original passed for the three term
        # dictionaries (semantics live in dict_sum).
        articleDb(docname + str(i),
                  dict_sum(states, article),
                  dict_sum(inflow_terms, article, 'i'),
                  dict_sum(outflow_terms, article, 'i'),
                  dict_sum(agri_terms, article, 'i'))
def read_data(fp):
    """For each agriculture-related article, map mentioned US states to the
    crop terms found in it and insert the (state, crop) pairs into
    ftm.cropType.

    fp -- passed straight through to splitLexisNexis_AG.
    Side effects: database inserts; errors are printed, not raised.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    agriculture = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')
    crops = read_csvDictionary('/home/team3/Data/Dictionaries/crops.csv')

    # Lookahead patterns find overlapping matches; compile each once instead
    # of rebuilding the alternation on every article/state iteration.
    # NOTE(review): dictionary entries are interpolated verbatim — if any
    # entry contains regex metacharacters this needs re.escape; confirm CSVs.
    states_re = re.compile(r"(?=(" + "|".join(states) + r"))")
    short_re = re.compile(r"(?=(" + "|".join(states_short) + r"))")
    agri_re = re.compile(r"(?=(" + "|".join(agriculture) + r"))")
    crops_re = re.compile(r"(?=(" + "|".join(crops) + r"))")

    result_type1 = {}  # short state name (from full-name match) -> crop hits
    result_type2 = {}  # short state name (matched directly) -> crop hits
    for article in articles:
        # Agriculture/crop hits depend only on the article, not on which
        # state matched — the original recomputed them per matched state.
        if not agri_re.findall(article):
            continue
        crop_hits = crops_re.findall(article)
        for state in states_re.findall(article):
            if state in states:
                # Translate the full name to its short form (parallel lists).
                result_type1[states_short[states.index(state)]] = crop_hits
        for short in short_re.findall(article):
            result_type2[short] = crop_hits
        # NOTE(review): result_type2 is built but, as in the original, never
        # persisted — only full-name matches reach the database.

    # Insert full-name results.  The original used a bare `except:` that
    # swallowed every failure with a bare "error"; report the cause and
    # always close the connection.
    try:
        conn = psycopg2.connect(
            "dbname='team3' user='******' host='localhost' password='******'")
        try:
            cur = conn.cursor()
            for short_name, crop_list in result_type1.items():
                for crop in crop_list:
                    cur.execute("INSERT into ftm.cropType values(%s,%s)",
                                (short_name, crop))
            conn.commit()
        finally:
            conn.close()
    except Exception as exc:
        print("error", exc)
def read_data(fp):
    """Count corruption-term hits per state across agriculture-related
    articles.

    fp -- passed straight through to splitLexisNexis_AG.
    Returns a Counter mapping short state name -> corruption-term hit count;
    a state found under both its full and short name has the counts summed.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    corruption = read_csvDictionary(
        '/home/team3/Data/Dictionaries/corruption1.csv')
    agriculture = read_csvDictionary(
        '/home/team3/Data/Dictionaries/agriculture.csv')

    # Lookahead patterns find overlapping matches; compile each alternation
    # once.  NOTE(review): entries are interpolated verbatim — needs
    # re.escape if any entry contains regex metacharacters; confirm CSVs.
    states_re = re.compile(r"(?=(" + "|".join(states) + r"))")
    short_re = re.compile(r"(?=(" + "|".join(states_short) + r"))")
    agri_re = re.compile(r"(?=(" + "|".join(agriculture) + r"))")
    cor_re = re.compile(r"(?=(" + "|".join(corruption) + r"))")

    result1 = {}  # short name via full-name match -> corruption count
    result2 = {}  # short name matched directly -> corruption count
    for article in articles:
        # Agriculture/corruption counts depend only on the article; the
        # original recomputed them for every matched state AND declared the
        # counters `global` after assigning them locally — a SyntaxError
        # ("name assigned before global declaration").  The globals served
        # no purpose and are removed.
        if not agri_re.findall(article):
            continue
        cor_count = len(cor_re.findall(article))
        if cor_count <= 0:
            continue
        for state in states_re.findall(article):
            if state in states:
                # Translate full name to short form (parallel lists).
                result1[states_short[states.index(state)]] = cor_count
        for short in short_re.findall(article):
            result2[short] = cor_count
    # Counter addition sums counts for states present in both dicts.
    return Counter(result1) + Counter(result2)