Example #1
def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)
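A minimal usage sketch for the generator above; the sample strings and the Dictionary / bag-of-words step are illustrative additions, not part of the original snippet:

import gensim
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import (remove_stopwords, strip_numeric,
                                          strip_punctuation, strip_short, strip_tags)

raw_corpus = [
    "<p>An example document with 3 HTML tags and a few stopwords.</p>",
    "Another short <b>snippet</b> to clean and tokenize.",
]

# preprocessing() yields one token generator per document
tokenized = [list(tokens) for tokens in preprocessing(raw_corpus)]
dictionary = Dictionary(tokenized)                    # token -> id mapping
bow = [dictionary.doc2bow(doc) for doc in tokenized]  # bag-of-words vectors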
Example #2
def compute_tokens(steam_sentences=None, save_to_disk=False, use_spacy=False):
    print('Computing tokens')

    if steam_sentences is None:
        steam_sentences = load_raw_data()

    counter = 0
    num_games = len(steam_sentences)

    steam_tokens = {}

    # You need to have downloaded the model first. Reference: https://spacy.io/models/en#section-en_core_web_lg
    nlp = spacy.load('en_core_web_lg')

    for app_id in steam_sentences:
        game_data = steam_sentences[app_id]
        counter += 1

        if (counter % 1000) == 0:
            print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_data['name']))

        if use_spacy:
            original_str = str(strip_tags(game_data['text']))

            original_str = original_str.replace('\t', ' ')

            # Reference: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
            original_str = original_str.strip().replace('\n', ' ').replace('\r', ' ')
            original_str = original_str.replace('&amp;', 'and').replace('&gt;', '>').replace('&lt;', '<')

            doc = nlp(original_str)

            ents = [str(entity).strip() for entity in doc.ents]  # Named entities.

            # Keep only words (no numbers, no punctuation).
            # Lemmatize tokens, remove punctuation and remove stopwords.
            doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

            # Add named entities, but only if they are a compound of more than one word.
            relevant_entities = [str(entity) for entity in ents if len(entity) > 1]
            doc.extend(relevant_entities)

            game_tokens = doc
        else:
            game_tokens = simple_preprocess(remove_stopwords(strip_tags(game_data['text'])), deacc=True, min_len=3)

        steam_tokens[app_id] = list(game_tokens)

    if save_to_disk:
        with open(get_token_file_name(), 'w') as f:
            json.dump(steam_tokens, f)

    return steam_tokens
Example #3
def getLemmatizedText(name, content, language):
  language = language[:2]
  language = language.lower()
  outText = ""
  if (language):
    if (language=="is"):
      outText = getLemmatizedTextIS(name, content)
      print("IS")
    else:
      outText = lemmatizerMultilanguage.getLemmatizedText(language, name+" "+content)
      print(language.upper())
  else:
    text = name+" "+content
    outText = text.lower()
    print("ERROR: No language for Lemmatizing text")
  cleaned = re.sub(' +', ' ',outText)
  cleaned = cleaned.replace('\n', '')
  cleaned = cleaned.replace('\r', '')

  cleaned = remove_stopwords(cleaned)
  cleaned = strip_tags(cleaned)
  cleaned = strip_punctuation(cleaned)
  cleaned = strip_numeric(cleaned)
  cleaned = strip_short(cleaned, 1)
  cleaned = strip_multiple_whitespaces(cleaned)
  cleaned = cleaned.lower()

  print("Lemmatized CLEAN: "+cleaned)
  return cleaned
def pre_process(s):
    s = str(s)
    s = strip_tags(s)
    s = deaccent(s)
    s = strip_multiple_whitespaces(s)
    s = s.lower()
    return s
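For reference, a hypothetical call (the input string is made up; in this sketch deaccent comes from gensim.utils and the other helpers from gensim.parsing.preprocessing):

from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces

print(pre_process("<b>Café   MENU</b>"))  # expected: "cafe menu"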
Example #5
def texts_to_sents(texts,
                   model="en_core_web_sm",
                   remove_stop=True,
                   lemmatize=True):
    """
    transform list of texts to list of sents (list of tokens) and apply
    simple text preprocessing
    """
    texts = [strip_tags(t) for t in texts]
    results = []

    assert spacy is not None, 'please install spacy, i.e., "pip install spacy"'

    try:
        nlp = spacy.load(model, disable=["ner"])
    except Exception as e:
        print(e, "\ntrying to download model...")
        os.system("python -m spacy download " + model)
        nlp = spacy.load(model, disable=["ner"])

    for doc in tqdm(nlp.pipe(texts), total=len(texts), desc="texts to sents"):
        for s in doc.sents:
            results.append([
                simple_preproc(
                    strip_non_alphanum(t.lemma_ if lemmatize else t.text))
                for t in s
                if not any((t.is_punct, t.is_space, remove_stop and t.is_stop))
            ])
    return results
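An illustrative call of texts_to_sents; the model name and sample text are assumptions, and the exact output depends on the spaCy model and on how simple_preproc post-processes each token:

sents = texts_to_sents(
    ["<p>The cats are sleeping. Dogs bark loudly.</p>"],
    model="en_core_web_sm",
)
# each inner list holds the cleaned, lemmatized tokens of one sentence,
# e.g. roughly [['cat', 'sleep'], ['dog', 'bark', 'loudly']]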
Example #6
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError as e:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            text_total = text_total + text.encode('utf-8') + '#'
        except UnicodeDecodeError as e:
            pass
    return text_total
def clean_text(text):
    """ Cleans the text in the only argument in various steps 
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''

    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possibly be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    #text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    #text = preprocessing.stem_text(text)
    return text
Example #8
def _normalize_target(s):
    s = s.lower()

    for k, v in contractions.items():
        s = s.replace(k, v)

    return strip_multiple_whitespaces(strip_punctuation(strip_tags(s))).split()
Example #9
def clean(sx):
    sx = strip_tags(sx)
    sx = strip_numeric(sx)
    sx = re.sub(r'\n', ' ', sx)
    sx = re.sub(r'\[', '', sx)
    sx = re.sub(r'\]', '', sx)
    sx = strip_multiple_whitespaces(sx)
    return sx
Example #10
def _normalize(s):
    s = s.lower()

    for k, v in contractions.items():
        s = s.replace(k, v)

    return strip_multiple_whitespaces(
        strip_non_alphanum(
            strip_numeric(remove_stopwords(strip_punctuation(
                strip_tags(s)))))).split()
def clean_text(text):
    """ Cleans the text in the only argument in various steps. NOT USED. 
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags
    text = preprocessing.strip_tags(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    return text
Example #12
def preprocess_text(self, text, tags=False, remove_digits=True):
    """preprocess text: tokenize docs, lowercase text, remove words with length < min_size, remove tags, remove digits-only tokens and remove stopwords"""
    if tags:  # remove tags
        text = strip_tags(text)
    if remove_digits:  # tokenize and remove digits-only tokens
        text = [
            token.text for token in self.tokenizer(text)
            if not self.only_digits(token.text)
        ]
    else:  # tokenize and keep digits-only tokens
        text = [token.text for token in self.tokenizer(text)]
    # return preprocessed doc
    return text
Example #13
def clean_text(text):
    """ Cleans the text in the only argument in various steps 
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags (numbers are kept in this variant)
    text = preprocessing.strip_tags(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    return text
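A hypothetical input showing the steps in order (this relies on the third-party contractions package imported by the original module):

clean_text("Check https://example.com\nIt's the 3rd <b>test</b>!!")
# URL and newline removed, "it's" expanded, then stopwords, tags and punctuation
# stripped, leaving roughly "check 3rd test"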
Example #14
def word_tokenize(text):
    try:
        if (isinstance(text, str)):
            words = text.lower().split()
        else:
            words = str(text).lower().split()

        if len(words) == 0:
            return ''
        text = ' '.join(words)
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        return text.encode('utf-8')
    except UnicodeDecodeError as e:
        return ''
Example #15
def gensim_clean_string(textIn, _strip_tags=True, _split_alphanumeric=True, _strip_nonalphanumeric=True,
                        _strip_multiple_whitespace=True, _strip_short=True, _short_charcount_min=3,
                        _strip_punctuation=False, _convert_to_lower=False):
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(cleaner)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_multiple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _strip_punctuation:
        # honour the flag (assumes strip_punctuation is imported with the other helpers)
        cleaner = strip_punctuation(cleaner)
    if _convert_to_lower:
        cleaner = cleaner.lower()

    return cleaner
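An assumed example input; with the defaults, tags are removed, non-alphanumeric characters become spaces, letter/digit runs are split apart, and tokens shorter than three characters are dropped:

gensim_clean_string("<p>GPU2 benchmark:   fp16 vs. fp32!</p>")
# -> roughly "GPU benchmark" (lowercasing and punctuation stripping stay off by default)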
Example #16
def get_magid_from_annotation(matchobject):
    """ This takes a found dblp annotation, scrapes the website and gets the title, uses this
    to query mag, returns a mag id with doc id prefix and suffix"""
    cited_dblpurl = matchobject.group(2)
    try:
        res = requests.get(cited_dblpurl, headers=headers)
        # the xml is after 'export record'
        res = res.text[res.text.find('export record'):]
        xml_url_matchobj = xml_p.search(res)
        if xml_url_matchobj is None:
            return 'citation'
        xml_url = xml_url_matchobj.group(2)
        sleep(1)
        print(xml_url)
        xml_res = requests.get(xml_url, headers=headers)
        soup = BeautifulSoup(xml_res.content, 'lxml')
        title_bs4tag = soup.find('title')
        # If it can't find the title for whatever reason, remove the citation
        if title_bs4tag is None:
            return 'citation'
        title = title_bs4tag.string
        if title is None:
            return 'citation'
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(
                preprocessing.strip_tags(title.lower()))).strip()
        pcur.execute(query2, (title, ))
        resultset = pcur.fetchone()
        if resultset is None:
            # If the uuid does not map to a mag id, replace with the word citation.
            #wordindex_magid_dict[i] = 'citation'
            print('not found')
            return 'citation'
        else:
            #print(resultset)
            fetched_magid = resultset['paperid']
            allmagpaperids.add(fetched_magid)
            return '{}{}{}'.format(docid_prefix, fetched_magid, docid_suffix)
    except requests.exceptions.MissingSchema:
        # for GC annotations
        return 'citation'
def map_dblp_to_mag_requests(dblp_url):
    """ Takes a dblp url, gets the title by scraping the website and getting the relevant xml file, and using
    that to get the title. This title is used to map to MAG."""
    try:
        res = requests.get(dblp_url, headers=headers)
        # the xml is after 'export record'
        res = res.text[res.text.find('export record'):]
        xml_url_matchobj = xml_p.search(res)
        if xml_url_matchobj is None:
            return None
        xml_url = xml_url_matchobj.group(2)
        sleep(1) 
        print(xml_url)  
        xml_res = requests.get(xml_url, headers=headers)
        soup = BeautifulSoup(xml_res.content, 'lxml')
        title_bs4tag = soup.find('title')
        # If it can't find the title for whatever reason, remove the citation
        if title_bs4tag is None:
            return None
        title = title_bs4tag.string
        if title is None:
            return None
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(preprocessing.strip_tags(title.lower()))).strip()
        pcur.execute(query2, (title,))
        resultset = pcur.fetchone()
        if resultset is None:
            # If the uuid does not map to a mag id, replace with the word citation.
            #wordindex_magid_dict[i] = 'citation'
            print('not found')
            return None
        else:
            #print(resultset)
            fetched_magid = resultset['paperid']
            #writer.writerow({'dblp_url': dblp_url, 'mag_id': fetched_magid})
            #allmagpaperids.add(fetched_magid)
            return fetched_magid
    except requests.exceptions.MissingSchema:
        # for GC annotations
        return None
Example #18
def word_tokenize_and_remove_stop_words(text, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()

        if len(words) == 0:
            return ''

        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        words = text.split()
        if len(words) == 0:
            return ''

        text = ' '.join(filter(lambda x: x not in stop_word2, words))
        return text.encode('utf-8')
    except UnicodeDecodeError as e:
        return ''
Example #19
def sentences_polishing(words_lst, what, deep_polishing=False):

    # calculating the character count for the entire review list
    lst_len_start = sum(len(s) for s in words_lst)
    print("Cleaning for list with " + str(lst_len_start) + " chars, for " +
          what)

    # deleting html tags
    words_lst = [strip_tags(x) for x in words_lst]

    # deleting punctuation
    words_lst = [strip_punctuation2(x) for x in words_lst]

    if deep_polishing:
        # Initializing pool for multiprocessing
        pool = Pool(processes=10)

        # for every review, apply function and save result
        words_lst = pool.map(stopWords, words_lst)
        pool.close()
        pool.join()

    # deleting empty reviews
    words_lst = [x for x in words_lst if x]

    # recalculating the character count and printing the result
    lst_len_end = sum(len(s) for s in words_lst)
    cleaned = lst_len_start - lst_len_end
    print("Deleted " + str(cleaned) + " (" +
          str(int(cleaned / lst_len_start * 100)) + "%) chars, for " + what +
          "\n")

    # freeing memory
    gc.collect()

    return words_lst
def Train_preprocess(yelp_round):

    input_file = 'train_rd%d.tmp' % (yelp_round)
    output_file = './swe_train_rd%d.txt' % (yelp_round)

    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')

    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                fo.write('\n')
            else:
                start = 0
            user_id = s.strip('\n').split()
            if len(user_id) < 1:
                print "there is no user_id following the start_mark!"
            fo.write(user_id[0] + ' ')
            s = ''
            if len(user_id) <= 1:
                continue
            else:
                for i in range(len(user_id) - 1):
                    s = s + user_id[i + 1] + ' '
        try:
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            ss = ss.lower()
            actual_word_cnt = actual_word_cnt + 1
            s = s + ss + ' '
        if (actual_word_cnt > 0):
            fo.write(s[:-1])
        else:
            continue

    fin.close()
    fo.close()

    # get user_file and train_file
    if os.path.isfile('./get_user_train_file') == False:
        command = 'gcc get_user_file_w2v_train.c -o get_user_file_w2v_train -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result'
        print command
        os.system(command)

    user_file = 'user_file_rd%d.txt' % (yelp_round)
    w2v_train = './w2v_train_rd%d.txt' % (yelp_round)
    command = './get_user_file_w2v_train -input %s -user %s -word %s' % (
        output_file, user_file, w2v_train)
    print command
    os.system(command)
Example #21
def remove_tags(string_value):
    """Removes all the tags and markup e.g. <p> </p>."""
    return strip_tags(string_value)
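Behaviour matches gensim's strip_tags, which drops markup but keeps the text in between (compare the unit test in Example #26 below):

remove_tags("<p><i>Hello</i> <b>World</b>!</p>")  # -> "Hello World!"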
Example #22
dictionary = np.load("dic40.npy")
model = Doc2Vec.load('ricardo_col40')
bigram_ = Phraser.load('bigrams_40')
dic_mapping = np.load('dic_mapping.npy')

stop = stopwords.words('english') + list(string.punctuation)
input_ = 'We are trying to use Host Migration with online matchmaker.To make it simple now we are using it now with "Show GUI".Basically we added a custom NetworkMigrationManager, where we only overrided OnClientDisconnectedFromHost, where we call the base function and set a flag to disable any message sending after migration (for testing). See the attached file: HostMigration.cs. We start with 3 players, then the server quits, and host migration happens between the 2 remaining machines by using the UI buttons. It seems like that it happens successfully, there will be a new server, and the another client receives this log: NetworkClient Reconnect::ffff:52.28.11.218:5054 UnityEngine.Networking.NetworkMigrationManager:OnGUI(). But when we try to send the first message (through a chat), we get this error: Send command attempted with no client running [client=hostId: 0 connectionId: 1 isReady: False channel count: 2].UnityEngine.Networking.NetworkBehaviour:SendCommandInternal(NetworkWriter, Int32, String) NetworkPlayer:CallCmdServerChatMessage(PlayerId, String) This is the point where we are stuck...We received this Send command attempted with no client running... message all the time when we try to send any message. What could be the problem? '
#input_  = "I've asked repeatedly about this and given often completely incorrect answers from supposed developers. Its both absurd that this wasnt done years ago, and hasnt been done in 5.4 with the editor now supporting retina. Basically, i'm holding off telling users they need to upgrade from osx 10.7 right now as i'm using Unity 5.2, as 5.3/5.4 has zero additional benefit. Lack of Retina is a deal breaker for me and the reason i won't be using Unity in any future projects ore recommending it to anyone. "
#
#input_ = 'Is there any reason why the same exact scene, with a large realtime spot on the play area, would have much more pixelated hard shadows under Fantastic quality with Very High Resolution shadows when using 5.4.3f1 instead of 5.3.7f1?'
#input_  ='Help! Earlier this year, Allegorithmic released Substance Designer 6 (along with Substance Painter 2.5), which added some great new features and enhanced the functionality of their Substance .sbsar files. Unfortunately, these do not appear to work properly in Unity. Aside from Substances created in Substance Designer 6x just not loading, I find that .sbsar files in Unity can be rendered at no higher a resolution than 2048x2048, despite Allegorithmic\'s format supporting higher resolutions. Checking the software manufacturer\'s forums, they say that unfortunately this is entirely in the hands of Unity. So I\'m posting here and asking, when can we hope to have full compatibility and feature support for Allegorithmic .sbsar files?'


test = input_.lower()
test = pre.strip_punctuation(test)
test = pre.strip_tags(test)
test = pre.strip_numeric(test)
test_final = [i for i in nltk.word_tokenize(test.encode('utf-8')) if i not in stop]
#bigrams  = ngrams(test_final,2)
result_test_bigram = bigram_[test_final]
#print (result_test_bigram)
list_input = []
#for bi in bigrams:
#	for word in bi:
#		list_input.append(word)
#print(list_input)
#for i in range(10):
#print(help(model))
print(result_test_bigram )
#test_vector = model[test_final]
vector_test = model.infer_vector(result_test_bigram,steps=10000)
Example #23
    def __init__(self, documents, speed="fast-learn", workers=None):
        """
        Parameters
        ----------
        documents: list of str
            Input corpus, should be a list of strings.

        speed: string (optional, default 'fast-learn')
            This parameter will determine how fast the model takes to train. The
            fast-learn option is the fastest and will generate the lowest quality
            vectors. The learn option will learn better quality vectors but take
            a longer time to train. The deep-learn option will learn the best quality
            vectors but will take significant time to train. The valid string speed
            options are:
                * fast-learn
                * learn
                * deep-learn

        workers: int (optional)
            The number of worker threads used to train the model. A larger
            number will lead to faster training.
        """
        # validate inputs
        if speed == "fast-learn":
            hs = 0
            negative = 5
            epochs = 40
        elif speed == "learn":
            hs = 1
            negative = 0
            epochs = 40
        elif speed == "deep-learn":
            hs = 1
            negative = 0
            epochs = 400
        else:
            raise ValueError(
                "speed parameter needs to be one of: fast-learn, learn or deep-learn"
            )

        if workers is None:
            pass
        elif isinstance(workers, int):
            pass
        else:
            raise ValueError("workers needs to be an int")

        self.documents = list(documents)

        # preprocess documents for training - tokenize and remove too long/short words
        train_corpus = [
            TaggedDocument(simple_preprocess(strip_tags(doc), deacc=True), [i])
            for i, doc in enumerate(documents)
        ]

        # create documents and word embeddings with doc2vec
        if workers is None:
            self.model = Doc2Vec(documents=train_corpus,
                                 vector_size=300,
                                 min_count=50,
                                 window=15,
                                 sample=1e-5,
                                 negative=negative,
                                 hs=hs,
                                 epochs=epochs,
                                 dm=0,
                                 dbow_words=1)
        else:
            self.model = Doc2Vec(documents=train_corpus,
                                 vector_size=300,
                                 min_count=50,
                                 window=15,
                                 sample=1e-5,
                                 negative=negative,
                                 hs=hs,
                                 workers=workers,
                                 epochs=epochs,
                                 dm=0,
                                 dbow_words=1)

        # create 5D embeddings of documents
        umap_model = umap.UMAP(n_neighbors=15, n_components=5,
                               metric='cosine').fit(
                                   self.model.docvecs.vectors_docs)

        # find dense areas of document vectors
        cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                                  metric='euclidean',
                                  cluster_selection_method='eom').fit(
                                      umap_model.embedding_)

        # calculate topic vectors from dense areas of documents
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # calculate topic sizes and index nearest topic for each document
        self._calculate_topic_sizes()

        # find topic words and scores
        self._find_topic_words_scores()
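A minimal construction sketch based on the docstring above; the class name (assumed to be the Top2Vec class from Example #30's source file), the corpus loader and the worker count are illustrative assumptions:

# Hypothetical usage: the hard-coded min_count=50 implies a reasonably large corpus.
docs = load_my_documents()          # placeholder: any sizeable list of strings
model = Top2Vec(documents=docs, speed="learn", workers=4)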
Example #24
def strip_html(s):
    s = gsp.strip_tags(s)
    return s
Example #25
File: run.py Project: Hippskill/ocs
def __call__(self, doc):
    striped = prep.strip_punctuation(doc)
    striped = prep.strip_tags(striped)
    striped = prep.strip_multiple_whitespaces(striped).lower()
    return striped
Example #26
def testStripTags(self):
    self.assertEqual(strip_tags("<i>Hello</i> <b>World</b>!"), "Hello World!")
def NN_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using Deep Neural Network
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.tmp' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None
    command = 'java -jar Split_NN.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    # remove stop words
    if d_type == 'train':
        input_file = './NN_train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = './NN_dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = './NN_test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    stop_file = 'english_stop.txt'

    fin = open(input_file, 'rb')
    fs = open(stop_file, "rb")
    tar_file = open(output_file, 'w+')

    with open(stop_file, "rb") as f:
        for i, l in enumerate(f):
            pass
        total = i + 1

    stop_word1 = ["" for i in range(total)]
    stop_word2 = ["" for i in range(total)]
    cnt1 = 0
    cnt2 = 0
    for l in fs:
        s = l.strip('\n')
        if "'" in s:
            stop_word1[cnt1] = s
            cnt1 = cnt1 + 1
        else:
            stop_word2[cnt2] = s
            cnt2 = cnt2 + 1

    user_flag = 0
    review_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                tar_file.write('\n')
            else:
                start = 0
            user_star = s.strip('\n').split()
            if (len(user_star) < 2):
                print "there is no user_id & star rating following the start_mark!"
                print len(user_star)
                for i in range(len(user_star)):
                    print user_star[i]
            tar_file.write(user_star[0] + '\t\t')
            tar_file.write(user_star[1] + '\t\t')
            continue
        try:
            s_array = s.encode('utf8').split()
            s = ''
            if len(s_array) > 0:
                for ss in s_array:
                    ss = ss.lower()
                    if ss not in stop_word1:
                        s = s + ss + ' '
            else:
                continue
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
            s = ''
            actual_word_cnt = 0
            if len(s_array) > 0:
                for ss in s_array:
                    if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":  # -LCB-, -LRB-, -RCB-, -RRB-
                        continue
                    if ss not in stop_word2:
                        s = s + ss + ' '
                        actual_word_cnt = actual_word_cnt + 1
                if (actual_word_cnt > 0):
                    tar_file.write(s[:-1])
                    tar_file.write('#')
            else:
                continue
        except UnicodeDecodeError:
            continue
    fin.close()
    tar_file.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)
Example #28
def test_strip_tags(self):
    self.assertEqual(strip_tags("<i>Hello</i> <b>World</b>!"),
                     "Hello World!")
def SVM_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using SVM
    # remove punctuation, tags, multiple spaces, stop words; convert all words to lower case.
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './SVM_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './SVM_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './SVM_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    stop_file = 'english_stop.txt'

    with open(stop_file, "rb") as f:
        for i, l in enumerate(f):
            pass
        total = i + 1

    fin = open(input_file, "rb")
    fo = open(output_file, "wb")

    stop_word1 = ["" for i in range(total)]
    stop_word2 = ["" for i in range(total)]
    cnt1 = 0
    cnt2 = 0

    with open(stop_file, "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1[cnt1] = s
                cnt1 = cnt1 + 1
            else:
                stop_word2[cnt2] = s
                cnt2 = cnt2 + 1

    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                fo.write('\n')
            else:
                start = 0
            user_id = s.strip('\n').split()
            if len(user_id) < 2:
                print "there is no user_id & star rating following the start_mark!"
            fo.write(user_id[0] + ' ' + user_id[1] + ' ')
            s = ''
            if len(user_id) <= 2:
                continue
            else:
                for i in range(len(user_id) - 2):
                    s = s + user_id[i + 2] + ' '
                #s = s[:-1]
        try:
            s_array = s.encode('utf8').split()
            s = ''
            if len(s_array) > 0:
                for ss in s_array:
                    ss = ss.lower()
                    if ss not in stop_word1:
                        s = s + ss + ' '
            else:
                continue
            s = s.strip('\n')
            if len(s) > 0:
                s = preprocessing.strip_punctuation(s)
                s = preprocessing.strip_non_alphanum(s)
                s = preprocessing.strip_numeric(s)
                s = preprocessing.strip_tags(s)
                s = preprocessing.strip_multiple_whitespaces(s)
                s_array = s.encode('utf8').split()
                s = ''
                if len(s_array) > 0:
                    for ss in s_array:
                        if ss not in stop_word2:
                            s = s + ss + ' '
                else:
                    continue
            else:
                continue
            if len(s) > 0:
                if s[-1] != ' ':
                    s = s + ' '
            else:
                continue
            fo.write(s)
        except UnicodeDecodeError:
            continue

    fin.close()
    fo.close()
Example #30
File: Top2Vec.py Project: yogeshmj/Top2Vec
def default_tokenizer(doc):
    """Tokenize documents for training and remove too long/short words"""
    return simple_preprocess(strip_tags(doc), deacc=True)
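An illustrative call of default_tokenizer above; the input string is made up. simple_preprocess lowercases, drops digits, punctuation and very short or very long tokens, and deacc=True also removes accents:

default_tokenizer("<b>Topic</b> modelling with Top2Vec, 2020!")
# -> roughly ['topic', 'modelling', 'with', 'top', 'vec']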
def PPL_preprocess(d_type, yelp_round):

    if d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    command = 'java -jar Split_PPL.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    if d_type == 'dev':
        input_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'PPL_test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')

    for s in fin:
        user_id = s.strip('\n').split()
        if len(user_id) <= 1:
            print "there is no word or only user_id in this line!"
            continue
        else:
            fo.write(user_id[0] + ' ')
            s = ''
            for i in range(len(user_id) - 1):
                s = s + user_id[i + 1] + ' '
            s = s[:-1]
            try:
                s = preprocessing.strip_punctuation(s)
                s = preprocessing.strip_non_alphanum(s)
                s = preprocessing.strip_numeric(s)
                s = preprocessing.strip_tags(s)
                s = preprocessing.strip_multiple_whitespaces(s)
                s_array = s.encode('utf8').split()

            except UnicodeDecodeError:
                fo.write('\n')
                continue

            s = ''
            actual_word_cnt = 0
            if len(s_array) > 0:
                for ss in s_array:
                    if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":
                        continue
                    ss = ss.lower()
                    s = s + ss + ' '
                    actual_word_cnt = actual_word_cnt + 1
                if actual_word_cnt > 0:
                    fo.write(s[:-1])
            fo.write('\n')

    fin.close()
    fo.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)

    # select a sentence for each user
    dic = {}
    lower_bound = 8
    upper_bound = 10

    if d_type == 'dev':
        input_file = './PPL_dev_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = './PPL_test_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_test_rd%d.txt' % (yelp_round)

    fo = open(output_file, "wb")
    user_count = 0

    user_file = 'user_file_rd%d.txt' % (yelp_round)
    with open(user_file, "rb") as fin:
        for line in fin:
            user_id = line.strip('\n')
            if user_id not in dic.keys():
                dic[user_id] = user_count
                user_count = user_count + 1
        total = user_count
    print "total %d user" % (total)
    recorder = [0 for i in range(total)]

    with open(input_file, "rb") as fin:
        for i, line in enumerate(fin):
            array_line = line.strip('\n').split()
            if array_line[0] == "unknown_user_id":
                pass
            else:
                if recorder[dic[array_line[0]]] != 0:
                    pass
                else:
                    if (len(array_line) >= (lower_bound + 1)
                            and len(array_line) <= (upper_bound + 1)):
                        fo.write(line.strip('\n'))
                        fo.write('\n')
                        recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 1)
                                and len(array_line) <= (upper_bound + 1 + 1)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 2)
                                and len(array_line) <= (upper_bound + 1 + 2)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 3)
                                and len(array_line) <= (upper_bound + 1 + 3)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        fo.write(line.strip('\n'))
                        fo.write('\n')
                        recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1
    if go_on == 1:
        print "ERROR"
    fo.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)