def __init__(self, dir, corpus_name):
    """Set up append-mode TSV output files for a corpus.

    Opens three sink files next to each other in *dir* —
    ``<corpus_name>.tok``, ``<corpus_name>.raw_token`` and
    ``<corpus_name>.sentences`` — each wrapped in a tab-separated
    ``csv.writer``, and compiles the emoticon regex used during
    tokenization.

    Args:
        dir: Output directory the corpus files are created in.
        corpus_name: Base name shared by all three output files.

    Note: the files are opened in append mode and kept open for the
    lifetime of the instance; they are not closed here.
    """
    self.dir = dir
    self.corpus_name = corpus_name
    # Per-token frequency counter; missing tokens default to 0.
    self.unique_token = defaultdict(int)

    # Identical writer setup for the three sinks — factored into one helper.
    self.f_token, self.token_writer = self._open_writer(".tok")
    self.f_raw_token, self.raw_token_writer = self._open_writer(".raw_token")
    self.f_sent, self.sent_writer = self._open_writer(".sentences")

    # `emoticon_string` is a module-level pattern (written in VERBOSE style).
    self.emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
    # TODO: use path from config or cmdline
    self.standard_abbreviations = load('preprocessor/config/data/abbrev')

def _open_writer(self, suffix):
    """Open ``<dir>/<corpus_name><suffix>`` for appending and return
    ``(file_object, csv_writer)`` configured as an unquoted TSV writer.

    NOTE(review): ``quotechar=''`` is rejected by Python 3's csv module
    (must be a 1-char string or None) — confirm target Python version;
    under Python 3 this should likely be ``quotechar=None``.
    NOTE(review): no explicit encoding is passed to ``open`` — output
    encoding is platform-dependent; confirm whether UTF-8 is intended.
    """
    f = open(joinp(self.dir, self.corpus_name + suffix), 'a')
    writer = csv.writer(f, delimiter='\t', lineterminator='\n',
                        quotechar='', quoting=csv.QUOTE_NONE)
    return f, writer
def __init__(self, list_of_stopwords):
    """Initialize the filter by loading its stopword list.

    Args:
        list_of_stopwords: Resource identifier handed to the
            module-level ``load`` helper; its result is kept as the
            private stopword collection.
    """
    self._stopwords = load(list_of_stopwords)