def get_words(self, txt):
    """Tokenize *txt* and return its words as utf-8 strings, stopwords removed.

    Returns an empty list when extract_dictionary yields None.
    """
    tokens = self.extract_dictionary(txt)
    if tokens is None:
        return []
    kept = []
    for token in tokens:
        encoded = any2utf8(token)
        if encoded not in self.stopwords:
            kept.append(encoded)
    return kept
def extract_keywords(txt):
    """Extract POS-tagged keywords from a French text.

    Returns the TextBlob (word, tag) pairs as a list.
    """
    fr_blob = TextBlob(
        any2utf8(txt),
        pos_tagger=PatternTagger(),
        analyzer=PatternAnalyzer(),
    )
    return list(fr_blob.tags)
def extract_dictionary(txt):
    """Segment a French text and return its words as a list."""
    fr_blob = TextBlob(
        any2utf8(txt),
        pos_tagger=PatternTagger(),
        analyzer=PatternAnalyzer(),
    )
    return list(fr_blob.words)
def put(self, id):
    """Update a dataset's CSV column mapping (HTTP PUT handler).

    Returns (form.errors, 422) on form validation failure, 404 when the
    dataset does not exist, 401 when it is not owned by the current user,
    and (message, 422) when the CSV column/timestamp validation fails.
    NOTE(review): there is no explicit return on the success path, so the
    handler returns None once validation passes -- confirm this is the
    intended framework behavior.
    """
    form = DatasetUpdateForm()
    if not form.validate_on_submit():
        return form.errors, 422
    # check if the record exists in the DB
    dataset = Dataset.query.filter_by(id=id).first()
    if dataset is None:
        return 404
    # check rights
    if dataset.user.id != current_user.id :
        return 401
    # an empty additional-columns field is normalized to None
    if len(form.additional_columns.data) :
        additional_columns = any2utf8(form.additional_columns.data)
    else :
        additional_columns = None
    # validate values
    csv_corpus = CSVCorpus(dataset.filepath, source_column=form.source_column.data, text_column=form.text_column.data, timestamp_column=form.time_column.data, time_pattern=form.time_pattern.data, additional_columns=additional_columns)
    try :
        csv_corpus.validate()
    except ValueError, e:  # Python 2 syntax; raised on missing column / bad time format
        return e.message, 422
def csv2elastic(dataset):
    """Index every row of a dataset's CSV file into Elasticsearch.

    ``dataset`` is a mapping holding the file path, column mapping and
    target index name.  The matching DB record's ``index_state`` is set
    to "processing" while rows are indexed, then to "done".  Returns the
    Elasticsearch response of the LAST indexed row (only the final ``res``
    survives the loop).
    """
    logger.info("loading csv file")
    # empty/None additional columns are passed through unchanged
    if dataset["additional_columns"] :
        additional_columns = any2utf8(dataset["additional_columns"])
    else :
        additional_columns = dataset["additional_columns"]
    # open the corpus
    csv_corpus = CSVCorpus(dataset["filepath"], timestamp_column = dataset["time_column"], time_pattern= dataset["time_pattern"], text_column=dataset["text_column"], source_column= dataset["source_column"], additional_columns=additional_columns )
    # ensure that index exists
    # get_index_info(dataset["index_name"])
    d = Dataset.query.filter_by(id=dataset["id"]).first()
    d.index_state = "processing"
    db.session.commit()
    # index row by row; each call hits Elasticsearch once
    for i, row in enumerate(csv_corpus) :
        # if i%10 == 0:
        # # print "emit socket"
        # socket.emit("progress", json.dumps({"count" : i}))
        res = elastic.index(dataset["index_name"], "message", row)
    # change the state to done
    d.index_state = "done"
    db.session.commit()
    return res
def __init__(self, fname, timestamp="created_at", time_pattern="%Y-%m-%dT%H:%M:%S",
             content="text", origin="user_id", adds=None):
    """Initialize the corpus from a CSV file.

    :param fname: path to a UTF-8 encoded CSV file with a header row
    :param timestamp: name of the timestamp column
    :param time_pattern: strptime pattern used to parse the timestamp column
    :param content: name of the text column
    :param origin: name of the author/source column
    :param adds: additional columns to keep -- a list, a comma-separated
        string, or None (defaults to no extra columns)
    :raises TypeError: if 'adds' has an unsupported type, or the file is
        not detected as UTF-8
    :raises KeyError: if the CSV file has no header row
    """
    logger.info("loading corpus from %s" % fname)
    self.fname = fname
    self.timestamp = timestamp
    self.time_pattern = time_pattern
    self.content = content
    self.origin = origin
    self.length = 0  # row count, updated while iterating

    # Normalize 'adds' to a list of column names.  The default is None
    # rather than [] to avoid the shared mutable-default-argument bug.
    if adds is None:
        self.adds = []
    elif isinstance(adds, str):
        self.adds = adds.split(",")
    elif isinstance(adds, list):
        self.adds = adds
    elif isinstance(adds, unicode):  # Python 2 text type
        self.adds = any2utf8(adds).split(",")
    else:
        raise TypeError("Wrong type for 'adds'")

    # load the first few lines, to guess the CSV dialect
    # (context manager so the sniffing handle is not leaked)
    with open(self.fname, "r") as sniff_file:
        head = ''.join(itertools.islice(sniff_file, 5))
    self.has_headers = csv.Sniffer().has_header(head)
    self.dialect = csv.Sniffer().sniff(head)
    logger.info("sniffed CSV delimiter=%r, headers=%s" %
                (self.dialect.delimiter, self.has_headers))

    # test encoding: reject anything chardet is not confident is utf-8
    # NOTE(review): a pure-ASCII head is detected as 'ascii' by chardet and
    # would be rejected here as well -- confirm this is intended.
    encoding = chardet.detect(head)
    self.encoding = encoding['encoding']
    if encoding['confidence'] < 0.99 or encoding['encoding'] != 'utf-8':
        raise TypeError(
            "File has an unknown encoding : %s. Please try UTF-8 for better compatibility" % encoding['encoding'])
    logger.info("encoding detected as %s" % (encoding["encoding"]))

    # this handle must stay open for the lifetime of the reader
    self.reader = csv.DictReader(open(self.fname, "r"), dialect=self.dialect)

    # headers are required
    if not self.has_headers:
        raise KeyError("CSV file should have headers")

    # store headers
    self.headers = self.reader.fieldnames
def validateCSV(self):
    """Validate the CSV: headers must exist, all configured columns must
    be present, and the first row's timestamp must match time_pattern.

    Raises KeyError (no headers) or ValueError (missing column / bad
    timestamp); consumes one row from self.reader.
    """
    if not self.has_headers:
        raise KeyError("CSV file should have headers")
    # required columns, checked in a fixed order
    column_checks = (
        (self.timestamp, "Time column '%s' not present in CSV"),
        (self.content, "Text column '%s' not present in CSV"),
        (self.origin, "Author column '%s' not present in CSV"),
    )
    for column, message in column_checks:
        if any2utf8(column) not in self.headers:
            raise ValueError(message % column)
    for column in self.adds:
        if any2utf8(column) not in self.headers:
            raise ValueError("Column '%s' not present in CSV" % column)
    # datetime.strptime raises ValueError on a format mismatch
    first_row = self.reader.next()
    datetime.strptime(first_row[any2utf8(self.timestamp)], any2utf8(self.time_pattern))
def validate(self):
    """Check that headers exist, that every configured column is present,
    and that the first row's timestamp parses with self.time_pattern.

    Raises KeyError (no headers) or ValueError (missing column / bad
    timestamp format); consumes the first row of self.reader.
    """
    if not self.has_headers:
        raise KeyError("CSV file should have headers")
    checks = [
        ("Time column '%s' not present in CSV", self.timestamp_column),
        ("Text column '%s' not present in CSV", self.text_column),
        ("Author column '%s' not present in CSV", self.source_column),
    ]
    for extra in self.additional_columns:
        checks.append(("Column '%s' not present in CSV", extra))
    for template, column in checks:
        if any2utf8(column) not in self.headers:
            raise ValueError(template % column)
    # parsing raises ValueError when the pattern does not match
    sample = self.reader.next()
    datetime.strptime(sample[any2utf8(self.timestamp_column)], any2utf8(self.time_pattern))
def build_query(q, stopwords):
    """Build a MongoDB keyword query.

    Messages must contain q (when given) and none of the stopwords (when
    given).  Returns {} when both arguments are None.
    """
    clauses = []
    if q is not None:
        clauses.append({"keywords": {"$in": [any2utf8(q)]}})
    if stopwords is not None:
        clauses.append({"keywords": {"$nin": stopwords}})
    # a non-empty clause list means at least one argument was provided
    if clauses:
        return {"$and": clauses}
    return {}
def __init__(self, fname, timestamp_column="created_at", time_pattern="%Y-%m-%dT%H:%M:%S",
             text_column="text", source_column="user_id", additional_columns=None):
    """Initialize the corpus from a CSV file.

    :param fname: path to a UTF-8 encoded CSV file with a header row
    :param timestamp_column: name of the timestamp column
    :param time_pattern: strptime pattern used to parse the timestamp column
    :param text_column: name of the text column
    :param source_column: name of the author/source column
    :param additional_columns: extra columns to keep -- a list, a
        comma-separated string, or None (defaults to no extra columns)
    :raises TypeError: if 'additional_columns' has an unsupported type, or
        the file is not detected as UTF-8
    :raises KeyError: if the CSV file has no header row
    """
    logger.info("loading corpus from %s" % fname)
    self.fname = fname
    self.timestamp_column = timestamp_column
    self.time_pattern = time_pattern
    self.text_column = text_column
    self.source_column = source_column
    self.length = 0  # row count, updated while iterating

    # Normalize 'additional_columns' to a list of column names.  The
    # default is None rather than [] to avoid the shared
    # mutable-default-argument bug.
    if additional_columns is None:
        self.additional_columns = []
    elif isinstance(additional_columns, str):
        self.additional_columns = additional_columns.split(",")
    elif isinstance(additional_columns, list):
        self.additional_columns = additional_columns
    elif isinstance(additional_columns, unicode):  # Python 2 text type
        self.additional_columns = any2utf8(additional_columns).split(",")
    else:
        raise TypeError("Wrong type for 'additional_columns'")

    # load the first few lines, to guess the CSV dialect
    # (context manager so the sniffing handle is not leaked)
    with open(self.fname, "r") as sniff_file:
        head = ''.join(itertools.islice(sniff_file, 5))
    self.has_headers = csv.Sniffer().has_header(head)
    self.dialect = csv.Sniffer().sniff(head)
    logger.info("sniffed CSV delimiter=%r, headers=%s" %
                (self.dialect.delimiter, self.has_headers))

    # test encoding: reject anything chardet is not confident is utf-8
    # NOTE(review): a pure-ASCII head is detected as 'ascii' by chardet and
    # would be rejected here as well -- confirm this is intended.
    encoding = chardet.detect(head)
    self.encoding = encoding['encoding']
    if encoding['confidence'] < 0.99 or encoding['encoding'] != 'utf-8':
        raise TypeError("File has an unknown encoding : %s. Please try UTF-8 for better compatibility"% encoding['encoding'])
    logger.info("encoding detected as %s" % (encoding["encoding"]))

    # this handle must stay open for the lifetime of the reader
    self.reader = csv.DictReader(open(self.fname, "r"), dialect=self.dialect)

    # headers are required
    if not self.has_headers:
        raise KeyError("CSV file should have headers")

    # store headers
    self.headers = self.reader.fieldnames
def __iter__(self):
    """Yield one dict per CSV row: utf-8 'text_column', parsed datetime
    'time_column', raw 'source_column', plus any configured additional
    columns (utf-8 encoded).  Increments self.length per row yielded."""
    for row_number, row in enumerate(self.reader, start=1):
        record = {
            "text_column": any2utf8(row[any2utf8(self.text_column)]),
            "time_column": datetime.strptime(row[any2utf8(self.timestamp_column)], self.time_pattern),
            "source_column": row[any2utf8(self.source_column)],
        }
        for extra in self.additional_columns:
            record[any2utf8(extra)] = any2utf8(row[any2utf8(extra)])
        self.length += 1  # running total of rows seen
        yield record
def get_topogram(dataset): if dataset["additional_columns"] : additional_columns = any2utf8(dataset["additional_columns"]) else : additional_columns = dataset["additional_columns"] # open the corpus csv_corpus = CSVCorpus(dataset["filepath"], timestamp_column = dataset["time_column"], time_pattern= dataset["time_pattern"], text_column=dataset["text_column"], source_column= dataset["source_column"], additional_columns=additional_columns ) # init NLP nlp = ChineseNLP() # start processing data topogram = NLPPreProcess(corpus=csv_corpus, nlp=nlp) print dataset["index_name"] return topogram
def __iter__(self):
    """Yield one dict per CSV row: utf-8 'content', parsed datetime
    'timestamp', raw 'origin', plus any extra columns from self.adds
    (utf-8 encoded).  Increments self.length per row yielded."""
    for row_number, row in enumerate(self.reader, start=1):
        record = {
            "content": any2utf8(row[any2utf8(self.content)]),
            "timestamp": datetime.strptime(row[any2utf8(self.timestamp)], self.time_pattern),
            "origin": row[any2utf8(self.origin)],
        }
        for extra in self.adds:
            record[any2utf8(extra)] = any2utf8(row[any2utf8(extra)])
        self.length += 1  # running total of rows seen
        yield record
def csv2elastic(dataset):
    """Index every row of a dataset's CSV file into Elasticsearch.

    ``dataset`` is a mapping holding the file path, column mapping and
    target index name.  The matching DB record's ``index_state`` is set
    to "processing" while rows are indexed, then to "done".  Returns the
    Elasticsearch response of the LAST indexed row (only the final ``res``
    survives the loop).
    """
    logger.info("loading csv file")
    # empty/None additional columns are passed through unchanged
    if dataset["additional_columns"]:
        additional_columns = any2utf8(dataset["additional_columns"])
    else:
        additional_columns = dataset["additional_columns"]
    # open the corpus
    csv_corpus = CSVCorpus(dataset["filepath"], timestamp_column=dataset["time_column"], time_pattern=dataset["time_pattern"], text_column=dataset["text_column"], source_column=dataset["source_column"], additional_columns=additional_columns)
    # ensure that index exists
    # get_index_info(dataset["index_name"])
    d = Dataset.query.filter_by(id=dataset["id"]).first()
    d.index_state = "processing"
    db.session.commit()
    # index row by row; each call hits Elasticsearch once
    for i, row in enumerate(csv_corpus):
        # if i%10 == 0:
        # # print "emit socket"
        # socket.emit("progress", json.dumps({"count" : i}))
        res = elastic.index(dataset["index_name"], "message", row)
    # change the state to done
    d.index_state = "done"
    db.session.commit()
    return res
def extract_keywords(txt):
    """Extract the top 20 keywords from a Chinese text (jieba TF-IDF),
    utf-8 encoded."""
    return [any2utf8(keyword) for keyword in jieba.analyse.extract_tags(txt, 20)]
def test_any2utf8(self):
    """any2utf8 should return a byte string ('str' on Python 2) for
    UTF-8 encoded input."""
    sample = "你好".decode('utf-8').encode('utf-8')
    converted = any2utf8(sample)
    self.assertIsInstance(converted, str)
def extract_keywords(txt):
    """Return the 20 highest-ranked keywords of a Chinese text, each
    passed through any2utf8."""
    top_tags = jieba.analyse.extract_tags(txt, 20)
    keywords = []
    for tag in top_tags:
        keywords.append(any2utf8(tag))
    return keywords