# Forked from aixeuy/DI-C-project
# File: collect_data.py
# 105 lines (97 loc), 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
def appendToFile(file_name, line):
    """Append ``line`` followed by a newline to the file ``file_name``.

    The file is opened in ``'a+'`` mode, so it is created if it does not
    exist and any existing content is kept.

    Args:
        file_name: Path of the file to append to.
        line: Text to write; ``"\\n"`` is added after it.
    """
    # Use a context manager so the handle is closed even if write() raises.
    with open(file_name, 'a+') as f:
        f.write(line + "\n")
def sentToStr(sent, sen):
    """Render a sentence as a single line of text ending in its label.

    Every item of ``sent`` is converted with ``str`` and the results are
    joined by single spaces; ``sen`` is then appended directly, with no
    separator in front of it.

    Args:
        sent: Iterable of sentence items (converted with ``str``).
        sen: Label string appended to the joined text.

    Returns:
        The joined text with ``sen`` appended.
    """
    words = map(str, sent)
    joined = ' '.join(words)
    return joined + sen
def process_semcor():
    """Collect labelled SemCor sentences that contain the item 'bank'.

    Every sentence of every SemCor file that contains ``'bank'`` is
    formatted with ``sentToStr`` and appended, via ``appendToFile``, to
    ``data/bank_semcor_labelled_tmp.txt`` with a one-character label:

    * ``'+'`` if the string form of the sense-tagged sentence contains
      ``depository_financial_institution.n.01``;
    * ``'-'`` otherwise, if it contains ``bank.n.01``;
    * ``'0'`` if it contains neither.

    Prints ``semcor`` before processing and the number of written
    sentences afterwards. Returns nothing.
    """
    # NOTE(review): the reviewed listing had lost its indentation; the
    # counter is taken to count written sentences and to be printed once
    # after all files have been processed -- confirm.
    print('semcor')
    from nltk.corpus import semcor
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_semcor_labelled_tmp.txt'
    for f in semcor.fileids():
        sents = semcor.sents(f)
        tsents = semcor.tagged_sents(f, 'sem')
        for i, sent in enumerate(sents):
            if word not in sent:
                continue
            # Stringify the tagged sentence once; both sense checks are
            # plain substring tests against this text.
            tagged = str(tsents[i])
            if sen1 in tagged:
                label = '+'
            elif sen2 in tagged:
                label = '-'
            else:
                label = '0'
            appendToFile(file_name, sentToStr(sent, label))
            count += 1
    print(count)
def process_brown():
    """Collect Brown corpus sentences that contain the item 'bank'.

    Every sentence of every Brown file that contains ``'bank'`` is
    formatted with ``sentToStr`` using the label ``'0'`` and appended, via
    ``appendToFile``, to ``data/bank_brwon_tmp.txt``.

    Prints ``brown`` before processing and the number of written sentences
    afterwards. Returns nothing.
    """
    # NOTE(review): the reviewed listing had lost its indentation; the
    # counter is taken to count written sentences and to be printed once
    # after all files have been processed -- confirm.
    print('brown')
    from nltk.corpus import brown
    count = 0
    word = 'bank'
    # NOTE(review): "brwon" looks like a typo for "brown"; the path is left
    # unchanged in case other code reads this file -- confirm before renaming.
    file_name = 'data/bank_brwon_tmp.txt'
    for f in brown.fileids():
        for sent in brown.sents(f):
            if word in sent:
                appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
    print(count)
def process_gutenberg():
    """Collect Gutenberg corpus sentences that contain the item 'bank'.

    Every sentence of every Gutenberg file that contains ``'bank'`` is
    formatted with ``sentToStr`` using the label ``'0'`` and appended, via
    ``appendToFile``, to ``data/bank_gutenberg_tmp.txt``.

    Prints ``gutenberg`` before processing and the number of written
    sentences afterwards. Returns nothing.
    """
    # NOTE(review): the reviewed listing had lost its indentation; the
    # counter is taken to count written sentences and to be printed once
    # after all files have been processed -- confirm.
    print('gutenberg')
    from nltk.corpus import gutenberg
    count = 0
    word = 'bank'
    file_name = 'data/bank_gutenberg_tmp.txt'
    for f in gutenberg.fileids():
        for sent in gutenberg.sents(f):
            if word in sent:
                appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
    print(count)
def process_webtext():
    """Collect Webtext corpus sentences that contain the item 'bank'.

    Every sentence of every Webtext file that contains ``'bank'`` is
    formatted with ``sentToStr`` using the label ``'0'`` and appended, via
    ``appendToFile``, to ``data/bank_webtext_tmp.txt``.

    Prints ``webtext`` before processing and the number of written
    sentences afterwards. Returns nothing.
    """
    # NOTE(review): the reviewed listing had lost its indentation; the
    # counter is taken to count written sentences and to be printed once
    # after all files have been processed -- confirm.
    print('webtext')
    from nltk.corpus import webtext
    count = 0
    word = 'bank'
    file_name = 'data/bank_webtext_tmp.txt'
    for f in webtext.fileids():
        for sent in webtext.sents(f):
            if word in sent:
                appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
    print(count)
def process_reuters():
    """Collect Reuters corpus sentences that contain the item 'bank'.

    Every sentence of every Reuters file that contains ``'bank'`` is
    formatted with ``sentToStr`` using the label ``'0'`` and appended, via
    ``appendToFile``, to ``data/bank_reuters_tmp.txt``.

    Prints ``reuters`` before processing and the number of written
    sentences afterwards. Returns nothing.
    """
    # NOTE(review): the reviewed listing had lost its indentation; the
    # counter is taken to count written sentences and to be printed once
    # after all files have been processed -- confirm.
    print('reuters')
    from nltk.corpus import reuters
    count = 0
    word = 'bank'
    file_name = 'data/bank_reuters_tmp.txt'
    for f in reuters.fileids():
        for sent in reuters.sents(f):
            if word in sent:
                appendToFile(file_name, sentToStr(sent, '0'))
                count += 1
    print(count)
# Run every corpus collector, in this fixed order, as soon as the module is
# executed or imported.
for _collector in (
    process_semcor,
    process_brown,
    process_gutenberg,
    process_webtext,
    process_reuters,
):
    _collector()