def readFromDir(osList):
    """
    Read the scraped raw data: one cleaned, comma-joined text blob is
    returned per directory in osList.
    """

    textList = []

    for directory in osList:
        textArray = []
        for (dirpath, dirnames, filenames) in os.walk(directory):
            for filename in filenames:
                # join dirpath and filename instead of chdir-ing around
                with open(os.path.join(dirpath, filename), 'r',
                          encoding='utf-8') as file:
                    textArray.append(file.read().lower())

        text_arr = ','.join(textArray)
        text_arr = strip_punctuation(text_arr)
        text_arr = strip_numeric(text_arr)
        text_arr = strip_non_alphanum(text_arr)
        textList.append(text_arr)

    return textList
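A minimal usage sketch (not from the source): the directory names are hypothetical, and the gensim helpers used by readFromDir must be importable.

import os
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_non_alphanum

corpora = readFromDir(['scraped/espn', 'scraped/bbc'])  # hypothetical directories of UTF-8 text files
print(len(corpora))  # one cleaned, comma-joined blob per directory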
def custom_preprocess(sentence):
  # Custom preprocessing for the test documents; this can also be applied to a
  # pandas DataFrame series via .apply().
  sentence = sentence.lower()
  no_stopwords = remove_stopwords(sentence)
  no_punctuation = strip_punctuation(no_stopwords)
  unwanted = remove_unwanted(no_punctuation)
  return unwanted
Example #3
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError as e:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            text_total = text_total + text + '#'
        except UnicodeDecodeError as e:
            pass
    return text_total
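A hedged usage sketch: it assumes NLTK's punkt model has been downloaded and passes two small stop-word sets; cleaned sentences are joined with '#'.

import nltk
from gensim.parsing import preprocessing

sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')  # requires nltk.download('punkt')
out = sentence_tokenize_and_word_tokenize_and_remove_stop_words(
    "The 3 cats sat here. The dogs ran!", sent_tok, {'the'}, {'here'})
print(out)  # e.g. 'cats sat#dogs ran#'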
def make_array_vectorize(text):
    texts = []
    texts.append(extract_sentences_from_paragraph(text))
    nested_list_len = lambda x: sum(len(sublist) for sublist in x)
    # 3500 is assumed to equal MAX_SENTENCE_LEN * EMBEDDING_SIZE
    source_text_vectors = np.zeros((nested_list_len(texts), 3500))
    vec_idx = 0
    if isinstance(texts[0], list):
        for sentences in texts:
            # Build one fixed-length embedding vector per sentence
            for s in sentences:
                sentence_vector = np.array([])
                s = remove_stopwords(strip_punctuation(strip_non_alphanum(str(s))))
                s = clean_str(s)
                for w in word_tokenize(s):
                    w = lemmatizer.lemmatize(w)
                    if w not in model_ft:
                        continue
                    if len(sentence_vector) < MAX_SENTENCE_LEN * EMBEDDING_SIZE:
                        sentence_vector = np.append(sentence_vector, model_ft[w])
                    else:
                        break
                # Pad with zero embeddings up to the fixed length
                while len(sentence_vector) < MAX_SENTENCE_LEN * EMBEDDING_SIZE:
                    sentence_vector = np.append(sentence_vector, np.zeros(EMBEDDING_SIZE))
                source_text_vectors[vec_idx] = sentence_vector
                vec_idx += 1
    return source_text_vectors
Example #5
    def clean_text(self, text_tag, processes=["urls", "punctuation", "numeric", "lower"]):
        text = self.texts[text_tag]

        if "urls" in processes:
            text = [re.sub(r"(?:\@|https?\://)\S+", "", str(x)) for x in text]
            text = [re.sub(r' +', ' ', str(x)) for x in text]
        if "stopwords" in processes:
            text = [remove_stopwords(x) for x in text]
        if "punctuation" in processes:
            text = [strip_punctuation(x) for x in text]
        if "numeric" in processes:
            text = [strip_numeric(x) for x in text]

        text = [x.replace('"', "") for x in text]
        text = [x.replace('©', "") for x in text]
        text = [x.replace('\n', " ") for x in text]
        text = [x.replace('\r', ".") for x in text]
        text = [x.replace('QT', " ") for x in text]
        text = [x.replace('RT', " ") for x in text]
        text = [x.replace('#', " ") for x in text]
        text = [strip_multiple_whitespaces(x) for x in text]
        text = [x.strip() for x in text]

        if "lower" in processes:
            text = [x.lower() for x in text]

        self.texts[text_tag] = text
Example #6
def getLemmatizedText(name, content, language):
  language = language[:2].lower()
  outText = ""
  if language:
    if language == "is":
      outText = getLemmatizedTextIS(name, content)
      print("IS")
    else:
      outText = lemmatizerMultilanguage.getLemmatizedText(language, name + " " + content)
      print(language.upper())
  else:
    # no language given: fall back to the raw, lower-cased text
    outText = (name + " " + content).lower()
    print("ERROR: No language for lemmatizing text")
  cleaned = re.sub(' +', ' ', outText)
  cleaned = cleaned.replace('\n', '')
  cleaned = cleaned.replace('\r', '')

  cleaned = remove_stopwords(cleaned)
  cleaned = strip_tags(cleaned)
  cleaned = strip_punctuation(cleaned)
  cleaned = strip_numeric(cleaned)
  cleaned = strip_short(cleaned, 1)
  cleaned = strip_multiple_whitespaces(cleaned)
  cleaned = cleaned.lower()

  print("Lemmatized CLEAN: "+cleaned)
  return cleaned
    def process_review_raw_data(self):

        print("Review data pre-processing start...")

        _reviews = []

        with open(config.path2datasets + self.dataset_name, 'r') as f:
            for line in f.readlines():
                review_json = json.loads(line)

                _business_id = review_json['business_id']
                _review_id = review_json['review_id']
                _stars = review_json['stars']
                _text = review_json['text']

                # remove punctuation and stopwords, then lowercase
                _text = strip_punctuation(_text)
                _text = remove_stopwords(_text)
                _text = _text.lower()

                _reviews.append({
                    'review_id': _review_id,
                    'business_id': _business_id,
                    'stars': _stars,
                    'text': _text
                })

        _reviews = pd.DataFrame(_reviews)
        _reviews.to_csv(config.path2data + self.dataset_name + "." +
                        config.path2reviews)
        _reviews = None

        print("Review data pre-processing DONE")
def noPuncNoNumb(corpora):
    # Nest the two filters; "strip_punctuation(stringa) and strip_numeric(stringa)"
    # only kept the result of strip_numeric, so punctuation was never removed.
    List_No_punct_numb = [[[strip_numeric(strip_punctuation(stringa)) for stringa in group]
                           for group in corpus] for corpus in corpora]
    return List_No_punct_numb
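A quick check of the nested comprehension on a hypothetical three-level corpora structure (corpus -> group -> string):

from gensim.parsing.preprocessing import strip_punctuation, strip_numeric

corpora = [[["Hello, world 42!", "2nd try."]]]
print(noPuncNoNumb(corpora))  # punctuation replaced by spaces and digits removed in each string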
Example #9
def _normalize_target(s):
    s = s.lower()

    for k, v in contractions.items():
        s = s.replace(k, v)

    return strip_multiple_whitespaces(strip_punctuation(strip_tags(s))).split()
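A hedged usage sketch: `contractions` is a module-level dict the snippet assumes; a minimal stand-in is shown here.

from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_punctuation, strip_tags

contractions = {"don't": "do not"}  # stand-in for the mapping _normalize_target expects
print(_normalize_target("Don't <b>shout</b>!"))  # ['do', 'not', 'shout']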
Example #10
def preprocess_for_lda(tweet, pos_tag=True):
    """
    Processes a tweet for entry into an LDA topic model. Removes hashtags and
    unnecessary characters, filters out stopwords, tokenizes the tweet into
    individual words, and lemmatizes the words.
    """
    tweet = preprocess_tweet_text(tweet)

    # Handle contractions
    tweet = decontract(tweet)

    # Remove punctuation
    tweet = strip_punctuation(tweet)

    # Remove multiple spaces
    tweet = strip_multiple_whitespaces(tweet)

    # Tokenize, lowercase everything, and remove emojis
    tokens = simple_preprocess(tweet, max_len=30)

    # Lemmatize tokens
    if pos_tag:
        # This uses pos-tags and is slower but more accurate
        words = lemmatize_sentence(tokens)
    else:
        words = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove stopwords
    words = [word for word in words if word not in FILTER_WORDS]

    return words
Example #11
def search_solr_parse_json(query, collection, search_field):
    """ Searches the arxiv_cs_metadata collection on arxiv_identifier (search_field)
    using the resp. arxiv id as the query, 
    parses the json result and returns it as a list of dictionaries where
    each dictionary corresponds to a record. 
    ARGUMENTS: query, string: each arxiv id
               collection: the Solr collection name (=arxiv_cs_metadata)
               search_field: the Solr field which is queried (=arxiv_identifier)
    RETURNS: docs, list of dicts: the documents (records) returned by Solr 
             AFTER getting the JSON response and parsing it."""
    solr_url = 'http://localhost:8983/solr/' + collection + '/select'
    url_params = {'q': query, 'rows': 1, 'df': search_field}
    solr_response = requests.get(solr_url, params=url_params)
    if solr_response.ok:
        data = solr_response.json()
        # Only one result, so index 0.
        docs = data['response']['docs']
        if docs == []:
            print(docs, query)
            return None, None
        doc = docs[0]
        title = doc.get('title').replace('\n', ' ')
        # Normalize the title
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(title.lower()))
        published_year = doc.get('published_date')[:4]
        return title, published_year
    else:
        print("Invalid response returned from Solr")
        sys.exit(11)
Example #12
    def get_fig_captions(self):
        """
        Get the figures captions of the Notebook document


        Returns
        -------
        captions : list of str
          Figures captions
        """

        captions = []
        cap = ''

        for line in self.get_figs_paragraph().splitlines():

            if line.startswith('-'):
                # only store a caption once some text has accumulated
                if cap:
                    captions.append(cap)
                cap = ''

            else:
                cap = cap + ' ' + line

        captions.append(cap)
        captions = [strip_non_alphanum(strip_punctuation(cap.lower()))
                    if not cap == '' else None
                    for cap in captions]

        return captions
Example #13
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases everything,
    strips extra spaces, punctuation and non-alphanumeric symbols. Also performs stemming.

    input:
        text: a string
    returns:
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text) # remove stop words
    text = preprocess.strip_short(text) #get rid of short words
    text = preprocess.strip_numeric(text) #get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('',text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('',text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)

    return text
def clean_text(text):
    """ Cleans the text passed as the only argument in several steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''

    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: could numbers possibly be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    #text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    #text = preprocessing.stem_text(text)
    return text
Example #15
def customized_strip(s):
    """
    Static function that strips a given text of most unwanted characters.

    Args:
        s (string): text we want to strip

    Returns:
        string: stripped text
    """

    # strip quotation marks
    s = s.replace('"', '')
    s = s.replace("'", '')
    s = s.replace('“', '')
    s = s.replace('”', '')

    s = re.sub(r'https?://\S+', ' ', s)  # strip urls
    s = re.sub(r'\d+', ' ', s)           # strip numbers

    # strip whitespace
    s = s.replace("\r", ' ').replace("\xa0", ' ')
    s = re.sub(r'\s+', ' ', s)

    s = strip_punctuation(s)
    s = s.lower()
    return s
Example #16
def tokenize(text):
    return [
        token for token in gensim.utils.simple_preprocess(
            gpp.strip_non_alphanum(
                gpp.strip_punctuation(
                    gpp.strip_multiple_whitespaces(gensim.utils.deaccent(
                        text))))) if token not in gpp.STOPWORDS
    ]
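A usage sketch of the tokenizer above, assuming the gpp alias for gensim.parsing.preprocessing:

import gensim
import gensim.parsing.preprocessing as gpp

print(tokenize("Crème brûlée — it's GREAT!"))  # e.g. ['creme', 'brulee', 'great']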
def preprocess_text(corpus=[]):
    print("Preprocessing Corpus from list data structure")
    for i, val in enumerate(corpus):  # iterate through the list, cleaning in place
        val = val.strip('\n')
        val = strip_punctuation(val)
        val = strip_non_alphanum(val)
        val = strip_numeric(val)
        corpus[i] = val
    return corpus
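A small usage example (hypothetical input list, cleaned in place):

from gensim.parsing.preprocessing import strip_punctuation, strip_non_alphanum, strip_numeric

docs = ["Hello, world!\n", "42 items left..."]
print(preprocess_text(docs))  # punctuation, non-alphanumeric characters and digits stripped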
Example #18
    def index_metadata(self, table):
        """
        Function that will index metadata of the documents.

        Function will index descriptors, subjects of the documents.
        Since they are important for the document they will be indexed
        by word and as a whole descriptor/subject.

        Parameters:
            table : string
                name of the table 
        
        Returns:
            None
        """

        documents = self.get_documents('postgres', 'dbpass',
                                       'eurlex_environment_only', table)

        for i, document in enumerate(documents):
            celex_number = document.get('document_celex_num')
            descriptor_name = document.get('descriptor_name', None)
            subject_name = document.get('subject_name', None)

            document_id = self.doc2id[celex_number]

            if descriptor_name is not None:
                for word in strip_punctuation(descriptor_name).lower().split():
                    if word not in self.stopwords:
                        self.index[word].add(document_id)

                # Also index the whole descriptor as a single key
                self.index[descriptor_name].add(document_id)

            if subject_name is not None:
                for word in strip_punctuation(subject_name).lower().split():
                    if word not in self.stopwords:
                        self.index[word].add(document_id)

                # Also index the whole subject as a single key
                self.index[subject_name].add(document_id)

            if i % 10000 == 0:
                print(f"Currently finished {i} documents. The size of the index is {len(self.index)}")
Example #19
def process_data(text_array):
    sents = text_array
    for i, sentence in enumerate(sents):
        sents[i] = strip_punctuation(sentence)
        sents[i] = remove_stopwords(sents[i])
        sents[i] = sents[i].lower()
    return sents
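A hedged usage sketch; note the input list is modified in place:

from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords

print(process_data(["The CAT, sat!", "Dogs ran."]))  # punctuation removed, stopwords dropped, lowercased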
Example #20
def new_processor(token):
    text = unidecode(token)
    text = strip_punctuation(text)
    # run the spaCy pipeline, then stem each token's text with a PorterStemmer instance
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(tok.text) for tok in sp(text)]
    return tokens
Example #21
def clean_text(text):
    """ Cleans the text passed as the only argument in several steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    return text
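A usage sketch; it assumes the third-party contractions package (which provides contractions.fix) and gensim's preprocessing module are imported as in the snippet.

import contractions
from gensim.parsing import preprocessing

print(clean_text("You're   sure, right?"))  # roughly 'You are sure right'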
Example #22
def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)
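A usage sketch of the generator; each yielded item is itself a token iterator:

import gensim
from gensim.parsing.preprocessing import (strip_numeric, remove_stopwords, strip_short,
                                          strip_punctuation, strip_tags)

for tokens in preprocessing(["The 3 quick brown foxes jumped!"]):
    print(list(tokens))  # e.g. ['the', 'quick', 'brown', 'foxes', 'jumped']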
Example #23
def _normalize(s):
    s = s.lower()

    for k, v in contractions.items():
        s = s.replace(k, v)

    return strip_multiple_whitespaces(
        strip_non_alphanum(
            strip_numeric(remove_stopwords(strip_punctuation(
                strip_tags(s)))))).split()
Example #24
    def tokenize(self, text):
        """
        Remove punctuation and lowercase text, then
        generate tokens of our chat file.
        """
        return [
            token
            for token in simple_preprocess(strip_punctuation(text.strip()))
            if token not in self.STOPWORDS
        ]
def preprocess_text(corpus, field_name='Comment'):
    print("Preprocessing Corpus from pandas data frame")
    for index, row in corpus.iterrows():  # iterate through rows in the dataframe
        line = row[field_name].strip('\n')
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(line)
        line = strip_short(line)
        # write the cleaned text back to the row/column in the corpus dataframe
        corpus.at[index, field_name] = line
    return corpus
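A pandas usage sketch with a hypothetical two-row frame:

import pandas as pd
from gensim.parsing.preprocessing import (strip_punctuation, strip_non_alphanum, strip_numeric,
                                          strip_multiple_whitespaces, strip_short)

df = pd.DataFrame({'Comment': ['Great product!!!\n', '10/10 would buy again...']})
print(preprocess_text(df))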
Example #26
def main():
    """ Main function"""
    sconn = db_connect()
    scur = sconn.cursor()
    create_acl_mag_table(sconn)
    reject = open('AdditionalOutputs/no_acl_mag_mapping.txt', 'w')
    with open('Metadata/acl-metadata.txt', 'r',
              encoding='ISO-8859-1') as aclfile:

        content = aclfile.read()
        #'id = {D10-1001}\nauthor = {Rush, Alexander M.; Sontag, David; Collins, Michael John; Jaakkola, Tommi}
        #\ntitle = {On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing}\n
        #venue = {EMNLP}\nyear = {2010}\n\nid = {D10-1002}\nauthor = {Huang, Zhongqiang; Harp'

    lines = content.split('\n\n')
    for line in lines:
        parts = line.split('\n')
        # 'id = {D10-1002}\nauthor = {Huang, Zhongqiang; Harper, Mary P.; Petrov, Slav}\ntitle = {Self-
        # Training with Products of Latent Variable Grammars}\nvenue = {EMNLP}\nyear = {2010}'
        acl_id = parts[0][parts[0].find('{') + 1:parts[0].find('}')]
        title = parts[2][parts[2].find('{') + 1:parts[2].find('}')]
        print(parts[4])
        publishedyear = int(parts[4][parts[4].find('{') +
                                     1:parts[4].find('}')])
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(title.lower())).strip()
        query1 = 'select paperid from papers where papertitle=%s and publishedyear=%s;'
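        # Note: `cur` (as opposed to `scur`) is assumed to be a module-level cursor
        # for the MAG `papers` database, defined outside this snippet.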
        cur.execute(query1, (title, publishedyear))
        paperid = cur.fetchone()
        if paperid:
            paperid = paperid['paperid']
        query2 = "select paperid from papers where papertitle=%s;"
        if not paperid:
            # Try the query without the year
            cur.execute(query2, (title, ))
            resultset = cur.fetchone()
            if not resultset:
                # Skip this reference, not found in MAG
                reject.write('{}\n'.format(acl_id))
                continue
            paperid = resultset['paperid']

        insert_into_acl_mag(sconn, scur, acl_id, paperid, publishedyear)

    try:
        sconn.commit()
    except Exception:
        print("Something went wrong while committing, attempting to rollback!")
        sconn.rollback()
    scur.execute("select count(*) from acl_mag")
    print("No. of records in db=", scur.fetchall())
    sconn.close()
    reject.close()
Example #27
def dataprocessing(x):
    x = rmvhtmltags(x)
    x = remove_urls(x)
    x = x.lower()
    x = rmvspclcharacter(x)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    x = lemmatize_words(x)

    x = ' '.join([re.sub(r'\d+', '', i) for i in word_tokenize(x)])
    return x
def export(type_data='train'):
    print("Extracting data...")
    if type_data.lower() == 'train':
        filename = 'training.1600000.processed.noemoticon.csv'
    elif type_data.lower() == 'test':
        filename = 'testdata.manual.2009.06.14.csv'
    data_file = codecs.open('Sentiment140/' + filename, encoding='ISO-8859-1')
    data = []
    for tweet in data_file.read().split('\n')[:-1]:
        data.append(
            [string for string in tweet.split('"') if string not in ['', ',']])
    data_file.close()
    labels = [(float(tweet[0]) / 4.0) for tweet in data]
    tweets = [tweet[-1] for tweet in data]

    print("Preprocessing data...")
    for i, tweet in enumerate(tweets):
        new_tweet = ' '.join([word for word in tweet.split(' ') if len(word)\
                            > 0 and word[0] not in ['@', '#'] and 'http' not\
                            in word]).strip()
        pro_tweet = [
            word[:-3] if word[-3:] == 'xxx' else word
            for word in preprocess_string(new_tweet.replace('not', 'notxxx'))
        ]
        #pro_tweet = preprocess_string(new_tweet)
        if len(pro_tweet) < 2:
            tweets[i] = strip_punctuation(stem_text(new_tweet.lower())).\
                        strip().split()
        else:
            tweets[i] = pro_tweet
        sys.stdout.write("\r%d tweet(s) pre-processed out of %d\r" %
                         (i + 1, len(tweets)))
        sys.stdout.flush()

    print("\nCleaning data...")
    backup_tweets = np.array(tweets)
    backup_labels = np.array(labels)
    tweets = []
    labels = []
    for i, tweet in enumerate(backup_tweets):
        if len(tweet) >= 2:
            tweets.append(tweet)
            labels.append(backup_labels[i])
    del backup_tweets
    del backup_labels

    # Shuffle the dataset
    data = list(zip(tweets, labels))
    np.random.shuffle(data)
    tweets, labels = list(zip(*data))

    return (tweets, labels)
Example #29
def process_string(string, stemming=True, remove_stopwords=True):

    string = string.lower()
    abbreviations = re.findall(r'(?:[a-z]\.)+', string)
    for abbr in abbreviations:
        string = string.replace(abbr, abbr.replace('.', ''))
    string = pproc.strip_punctuation(string)
    if remove_stopwords:
        string = pproc.remove_stopwords(string)
    if stemming:
        string = pproc.stem_text(string)
    string = string.strip()
    return string
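A usage sketch, assuming gensim.parsing.preprocessing is imported as pproc as in the snippet above:

from gensim.parsing import preprocessing as pproc

print(process_string("The U.S. economy, e.g., grew."))  # abbreviations collapsed to 'us'/'eg', stopwords removed, stems applied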
Example #30
def string_processor(token):
    text = unidecode(token)
    text = remove_stopwords(text)
    text = strip_punctuation(text)
    text = strip_non_alphanum(text)  # also removes any leftover punctuation
    tokens = sp(text)
    # .lemma_ maps pronouns like "i" to "-PRON-" in older spaCy versions
    tokens = [tok.lemma_ for tok in tokens]
    tokens = [porter_stemmer.stem(tok) for tok in tokens]
    text = " ".join(tokens)
    text = strip_multiple_whitespaces(text)
    return text.strip(' ')
Example #31
def get_text_sentences(filepath, sbd_model):
    tokens_by_sentence = []
    with codecs.open(filepath, encoding='utf8') as f:
        raw_text = f.read()
        #raw_text = raw_text.lower()
        raw_text = strip_multiple_whitespaces(raw_text)
        sentences = splitta.sbd.sbd_text(sbd_model, raw_text, do_tok=False)
        for s in sentences:
            new_s = strip_punctuation(s)
            tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
        #print raw_text
        #for filt in self.preprocess:
        #    raw_text = filt(raw_text)
        #text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
    return sentences, tokens_by_sentence
Example #32
def save_word_dict(text):
    proc_text = []

    sentences = tokenize.sent_tokenize(text)

    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)

        proc_sentence = word_tokenize(sentence_without_stops.lower())

        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
Example #33
def main():
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
    logging.info("Loading sentences dict...")

    sentences_dict_file = "../../experiments/ptr/sunlight_full_train.sentences_dict.p"
    with open(sentences_dict_file) as f:
        sentences_dict = cPickle.load(f)

    logging.info("Loading files")
    ct = 0
    tokens_by_sentence_dict = {}
    for fname, sentence_list in sentences_dict.iteritems():
        ct += 1
        tokens_by_sentence = []
        for s in sentence_list:
            new_s = strip_punctuation(s)
            tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
        tokens_by_sentence_dict[fname] = tokens_by_sentence
        if ct % 10000 == 0:
            logging.info("Writing tokens by sentence %s" % ct)
            with open("../../experiments/ptr/sunlight_full_train.tokens_by_sentence.%s.p" % str(ct), "w") as f:
                cPickle.dump(tokens_by_sentence_dict, f)
            tokens_by_sentence_dict = {}

    sys.exit()
    # data_path = '../../data/fcc/sunlight_full_partitions/'

    # create corpus
    output_path = "../../experiments/ptr/"
    filename = "sunlight_full_train"

    # logger.info("Saving files")
    with open(os.path.join(output_path, filename + ".sentences_dict.p"), "w") as f:
        cPickle.dump(sentences_dict, f)

    with open(os.path.join(output_path, filename + ".tokens_by_sentence_dict.p"), "w") as f:
        cPickle.dump(tokens_by_sentence_dict, f)
Example #34
def clean_string(string):
    # Empty strings
    if not string or string == 'N':
        return None

    string = deaccent(string).lower()

    # Remove quote text
    string = re.sub(re_reply_to, '', string)
    string = re.sub(re_quote_line, '', string)

    string = re.sub(re_youtube_link, ' YOUTUBELINK ', string)
    string = re.sub(re_link, ' WEBLINK ', string)
    string = re.sub(re_pol_board, ' pol ', string)
    string = re.sub(re_b_board, ' RANDOMBOARD ', string)
    string = re.sub(re_chan_board, ' CHANBOARD ', string)

    string = strip_punctuation(string)

    # Punctuation to remove completely
    # string = re.sub(re_punc_to_none, '', string)

    # Substitute in this order
    # string = re.sub(re_ellipsis, ' <ELLIPSIS> ', string)
    # string = re.sub(re_echoes, ' <ECHOES> ', string)
    # string = re.sub(re_pol_board, ' <POLBOARD> ', string)
    # string = re.sub(re_numbers, ' <NUMBER> ', string)
    # string = re.sub(re_period, ' <PERIOD> ', string)
    # string = re.sub(re_question, ' <QUESTION> ', string)

    # Replace all other punc to spaces and remove whitespace in between
    # string = re.sub(re_punc_to_space, ' ', string)

    # collapse runs of whitespace between the remaining words
    string = ' '.join(string.split())

    return string if string else None